<!DOCTYPE html>
<html lang="zh-CN">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
  <meta name="theme-color" content="#222">
  <meta name="generator" content="Hexo 4.2.1">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/safari-pinned-tab.svg" color="#222">
  <link rel="stylesheet" href="/css/main.css">
  <link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
  <link rel="stylesheet" href="/lib/pace/pace-theme-minimal.min.css">
  <script src="/lib/pace/pace.min.js"></script>
  <script id="hexo-configurations">
    // Theme namespace: reuse window.NexT when it already exists, otherwise start empty.
    var NexT = window.NexT || {};

    // Site-wide settings object (emitted into the "hexo-configurations" script).
    var CONFIG = {
      "hostname": "cuiqingcai.com",
      "root": "/",
      "scheme": "Pisces",
      "version": "7.8.0",
      "exturl": false,

      // Sidebar layout plus its widget list; every widget carries its own "enable" flag.
      "sidebar": {
        "position": "right",
        "width": 360,
        "display": "post",
        "padding": 18,
        "offset": 12,
        "onmobile": false,
        "widgets": [
          // Image widgets: "url" is the link target, "src" the image address.
          {
            "type": "image",
            "name": "阿布云",
            "enable": false,
            "url": "https://www.abuyun.com/http-proxy/introduce.html",
            "src": "https://qiniu.cuiqingcai.com/88au8.jpg",
            "width": "100%"
          },
          {
            "type": "image",
            "name": "天验",
            "enable": true,
            "url": "https://tutorial.lengyue.video/?coupon=12ef4b1a-a3db-11ea-bb37-0242ac130002_cqx_850",
            "src": "https://qiniu.cuiqingcai.com/bco2a.png",
            "width": "100%"
          },
          {
            "type": "image",
            "name": "华为云",
            "enable": false,
            "url": "https://activity.huaweicloud.com/2020_618_promotion/index.html?bpName=5f9f98a29e2c40b780c1793086f29fe2&bindType=1&salesID=wangyubei",
            "src": "https://qiniu.cuiqingcai.com/y42ik.jpg",
            "width": "100%"
          },
          {
            "type": "image",
            "name": "张小鸡",
            "enable": false,
            "url": "http://www.zxiaoji.com/",
            "src": "https://qiniu.cuiqingcai.com/fm72f.png",
            "width": "100%"
          },
          {
            "type": "image",
            "name": "Luminati",
            "src": "https://qiniu.cuiqingcai.com/ikkq9.jpg",
            "url": "https://luminati-china.io/?affiliate=ref_5fbbaaa9647883f5c6f77095",
            "width": "100%",
            "enable": false
          },
          {
            "type": "image",
            "name": "IPIDEA",
            "url": "http://www.ipidea.net/?utm-source=cqc&utm-keyword=?cqc",
            "src": "https://qiniu.cuiqingcai.com/0ywun.png",
            "width": "100%",
            "enable": true
          },
          // Non-image widgets only need a type, a display name and the flag.
          { "type": "tags", "name": "标签云", "enable": true },
          { "type": "categories", "name": "分类", "enable": true },
          { "type": "friends", "name": "友情链接", "enable": true },
          { "type": "hot", "name": "猜你喜欢", "enable": true }
        ]
      },

      "copycode": { "enable": true, "show_result": true, "style": "mac" },
      "back2top": { "enable": true, "sidebar": false, "scrollpercent": true },
      "bookmark": { "enable": false, "color": "#222", "save": "auto" },

      "fancybox": false,
      "mediumzoom": false,
      "lazyload": false,
      "pangu": true,

      "comments": {
        "style": "tabs",
        "active": "gitalk",
        "storage": true,
        "lazyload": false,
        "nav": null,
        "activeClass": "gitalk"
      },

      // The "${...}" markers below sit inside ordinary double-quoted strings,
      // so they are stored literally rather than interpolated here.
      "algolia": {
        "hits": { "per_page": 10 },
        "labels": {
          "input_placeholder": "Search for Posts",
          "hits_empty": "We didn't find any results for the search: ${query}",
          "hits_stats": "${hits} results found in ${time} ms"
        }
      },

      "localsearch": {
        "enable": true,
        "trigger": "auto",
        "top_n_per_article": 10,
        "unescape": false,
        "preload": false
      },

      "motion": {
        "enable": false,
        "async": false,
        "transition": {
          "post_block": "bounceDownIn",
          "post_header": "slideDownIn",
          "post_body": "slideDownIn",
          "coll_header": "slideLeftIn",
          "sidebar": "slideUpIn"
        }
      },

      "path": "search.xml"
    };

  </script>
  <meta name="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
  <meta property="og:type" content="website">
  <meta property="og:title" content="静觅">
  <meta property="og:url" content="https://cuiqingcai.com/page/12/index.html">
  <meta property="og:site_name" content="静觅">
  <meta property="og:description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
  <meta property="og:locale" content="zh_CN">
  <meta property="article:author" content="崔庆才">
  <meta property="article:tag" content="崔庆才">
  <meta property="article:tag" content="静觅">
  <meta property="article:tag" content="PHP">
  <meta property="article:tag" content="Java">
  <meta property="article:tag" content="Python">
  <meta property="article:tag" content="Spider">
  <meta property="article:tag" content="爬虫">
  <meta property="article:tag" content="Web">
  <meta property="article:tag" content="Kubernetes">
  <meta property="article:tag" content="深度学习">
  <meta property="article:tag" content="机器学习">
  <meta property="article:tag" content="数据分析">
  <meta property="article:tag" content="网络">
  <meta property="article:tag" content="IT">
  <meta property="article:tag" content="技术">
  <meta property="article:tag" content="博客">
  <meta name="twitter:card" content="summary">
  <link rel="canonical" href="https://cuiqingcai.com/page/12/">
  <script id="page-configurations">
    // Per-page values attached to the shared CONFIG object.
    // Reference: https://hexo.io/docs/variables.html
    CONFIG.page = { sidebar: "", isHome: true, isPost: false, lang: 'zh-CN' };

  </script>
  <title>静觅丨崔庆才的个人站点</title>
  <meta name="google-site-verification" content="p_bIcnvirkFzG2dYKuNDivKD8-STet5W7D-01woA2fc" />
  <noscript>
    <style>
      /* Applied only when scripting is disabled: put the theme's ".use-motion"
         elements (and the sidebar) back to their initial opacity and offsets. */

      /* Blocks that only need their opacity reset. */
      .use-motion .brand,
      .use-motion .menu-item,
      .sidebar-inner,
      .use-motion .post-block,
      .use-motion .pagination,
      .use-motion .comments,
      .use-motion .post-header,
      .use-motion .post-body,
      .use-motion .collection-header {
        opacity: initial;
      }

      /* Title and subtitle also get their vertical offset reset. */
      .use-motion .site-title,
      .use-motion .site-subtitle {
        opacity: initial;
        top: initial;
      }

      /* Logo lines: reset the horizontal offset on each side. */
      .use-motion .logo-line-before i {
        left: initial;
      }

      .use-motion .logo-line-after i {
        right: initial;
      }
    </style>
  </noscript>
  <link rel="alternate" href="/atom.xml" title="静觅" type="application/atom+xml">
</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container">
    <div class="headband"></div>
    <!-- Site header: brand block, main menu, and the search popup.
         Icon glyphs are marked aria-hidden so assistive technology skips them;
         the search field gets an explicit accessible name because a placeholder
         alone is not a label. Classes and ids are unchanged. -->
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner">
        <div class="site-brand-container">
          <!-- Navigation toggle drawn as three lines. -->
          <div class="site-nav-toggle">
            <div class="toggle" aria-label="切换导航栏">
              <span class="toggle-line toggle-line-first"></span>
              <span class="toggle-line toggle-line-middle"></span>
              <span class="toggle-line toggle-line-last"></span>
            </div>
          </div>
          <!-- Brand link back to the site root. -->
          <div class="site-meta">
            <a href="/" class="brand" rel="start">
              <span class="logo-line-before"><i></i></span>
              <h1 class="site-title">静觅 <span class="site-subtitle"> 崔庆才的个人站点 </span>
              </h1>
              <span class="logo-line-after"><i></i></span>
            </a>
          </div>
          <!-- Search trigger shown next to the brand (shares the popup-trigger class with the menu entry). -->
          <div class="site-nav-right">
            <div class="toggle popup-trigger">
              <i class="fa fa-search fa-fw fa-lg" aria-hidden="true"></i>
            </div>
          </div>
        </div>
        <nav class="site-nav">
          <ul id="menu" class="main-menu menu">
            <li class="menu-item menu-item-home">
              <a href="/" rel="section">首页</a>
            </li>
            <li class="menu-item menu-item-archives">
              <a href="/archives/" rel="section">文章列表</a>
            </li>
            <li class="menu-item menu-item-tags">
              <a href="/tags/" rel="section">文章标签</a>
            </li>
            <li class="menu-item menu-item-categories">
              <a href="/categories/" rel="section">文章分类</a>
            </li>
            <li class="menu-item menu-item-about">
              <a href="/about/" rel="section">关于博主</a>
            </li>
            <li class="menu-item menu-item-message">
              <a href="/message/" rel="section">给我留言</a>
            </li>
            <li class="menu-item menu-item-search">
              <a role="button" class="popup-trigger">搜索 </a>
            </li>
          </ul>
        </nav>
        <!-- Search popup: input row, close button, and the result container. -->
        <div class="search-pop-overlay">
          <div class="popup search-popup">
            <div class="search-header">
              <span class="search-icon">
                <i class="fa fa-search" aria-hidden="true"></i>
              </span>
              <div class="search-input-container">
                <input autocomplete="off" autocapitalize="off" placeholder="搜索..." spellcheck="false" type="search" class="search-input" aria-label="搜索">
              </div>
              <span class="popup-btn-close">
                <i class="fa fa-times-circle" aria-hidden="true"></i>
              </span>
            </div>
            <div id="search-result">
              <!-- Spinner placeholder inside #no-result. -->
              <div id="no-result">
                <i class="fa fa-spinner fa-pulse fa-5x fa-fw" aria-hidden="true"></i>
              </div>
            </div>
          </div>
        </div>
      </div>
    </header>
    <div class="back-to-top">
      <i class="fa fa-arrow-up"></i>
      <span>0%</span>
    </div>
    <div class="reading-progress-bar"></div>
    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div class="content index posts-expand">
            <!-- Promotional carousel: full-size slides live in .ws_images and the
                 thumbnails in .ws_thumbs use the same images in the same order.
                 Every image carries alt text (copied from the slide title) and
                 slide links opened in a new tab carry rel="noopener", matching
                 the article links elsewhere on this page. -->
            <div class="carousel">
              <div id="wowslider-container">
                <div class="ws_images">
                  <ul>
                    <li><a target="_blank" rel="noopener" href="https://cuiqingcai.com/5052.html"><img title="Python3网络爬虫开发实战教程" src="https://qiniu.cuiqingcai.com/ipy96.jpg" alt="Python3网络爬虫开发实战教程"></a></li>
                    <li><a target="_blank" rel="noopener" href="https://t.lagou.com/fRCBRsRCSN6FA"><img title="52讲轻松搞定网络爬虫" src="https://qiniu.cuiqingcai.com/fqq5e.png" alt="52讲轻松搞定网络爬虫"></a></li>
                    <li><a target="_blank" rel="noopener" href="https://brightdata.grsm.io/cuiqingcai"><img title="亮网络解锁器" src="https://qiniu.cuiqingcai.com/6qnb7.png" alt="亮网络解锁器"></a></li>
                    <li><a target="_blank" rel="noopener" href="https://cuiqingcai.com/4320.html"><img title="Python3网络爬虫开发视频教程" src="https://qiniu.cuiqingcai.com/bjrny.jpg" alt="Python3网络爬虫开发视频教程"></a></li>
                    <li><a target="_blank" rel="noopener" href="https://cuiqingcai.com/5094.html"><img title="爬虫代理哪家强？十大付费代理详细对比评测出炉！" src="https://qiniu.cuiqingcai.com/nifs6.jpg" alt="爬虫代理哪家强？十大付费代理详细对比评测出炉！"></a></li>
                  </ul>
                </div>
                <!-- NOTE(review): the thumbnail anchors use href="#" together with
                     target="_blank"; presumably the slider script intercepts the
                     click. Left unchanged; confirm before altering. -->
                <div class="ws_thumbs">
                  <div>
                    <a target="_blank" href="#"><img src="https://qiniu.cuiqingcai.com/ipy96.jpg" alt="Python3网络爬虫开发实战教程"></a>
                    <a target="_blank" href="#"><img src="https://qiniu.cuiqingcai.com/fqq5e.png" alt="52讲轻松搞定网络爬虫"></a>
                    <a target="_blank" href="#"><img src="https://qiniu.cuiqingcai.com/6qnb7.png" alt="亮网络解锁器"></a>
                    <a target="_blank" href="#"><img src="https://qiniu.cuiqingcai.com/bjrny.jpg" alt="Python3网络爬虫开发视频教程"></a>
                    <a target="_blank" href="#"><img src="https://qiniu.cuiqingcai.com/nifs6.jpg" alt="爬虫代理哪家强？十大付费代理详细对比评测出炉！"></a>
                  </div>
                </div>
                <div class="ws_shadow"></div>
              </div>
            </div>
            <link rel="stylesheet" href="/lib/wowslide/slide.css">
            <script src="/lib/wowslide/jquery.min.js"></script>
            <script src="/lib/wowslide/slider.js"></script>
            <script>
              // Start the slider plugin on #wowslider-container. The IIFE keeps
              // the helper variable out of the global scope.
              (function ($) {
                // Shared value for duration and delay (20 * 100 = 2000);
                // presumably milliseconds. TODO confirm against the plugin docs.
                var timing = 20 * 100;

                $("#wowslider-container").wowSlider({
                  effect: "cube",
                  prev: "",
                  next: "",
                  duration: timing,
                  delay: timing,
                  width: 716,
                  height: 297,
                  autoPlay: true,
                  playPause: true,
                  stopOnHover: false,
                  loop: false,
                  bullets: 0,
                  caption: true,
                  captionEffect: "slide",
                  controls: true,
                  onBeforeStep: 0,
                  images: 0
                });
              })(jQuery);

            </script>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7712.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> JavaScript <i class="label-arrow"></i>
                  </a>
                  <a href="/7712.html" class="post-title-link" itemprop="url">书籍《Python3 反爬虫原理与绕过实战》详细目录和最新消息</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>在夜幕读者群和算法反爬虫群的朋友都知道，我的新书《Python3 反爬虫原理与绕过实战》很快就要印刷出版了。 出版社的小姐姐们为本书设计了很多款封面 <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122456.png" alt=""></p>
                  <blockquote>
                    <p>但目前暂未选定封面</p>
                  </blockquote>
                  <p>之前我也有放出大章目录和配套代码，但详细目录和最新进展一直没机会公开。配套代码放在 <a href="https://github.com/asyncins/antispider" target="_blank" rel="noopener">GitHub 仓库</a>，大章目录也在。 <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122620.jpg" alt=""> 这次将详细目录呈现给大家。请大家先阅读《Python3 反爬虫原理与绕过实战》的内容提要</p>
                  <blockquote>
                    <p>本书描述了爬虫技术与反爬虫技术的对抗过程，并详细介绍了这其中的原理和具体实现方法。首先讲 解开发环境的配置、Web 网站的构成、页面渲染以及动态网页和静态网页对爬虫造成的影响。然后介绍了 不同类型的反爬虫原理、具体实现和绕过方法，另外还涉及常见验证码的实现过程，并使用深度学习技术 完成了验证。最后介绍了常见的编码和加密原理、JavaScript 代码混淆知识、前端禁止事件以及与爬虫相 关的法律知识和风险点。 本书既适合需要储备反爬虫知识的前端工程师和后端工程师，也适合需要储备绕过知识的爬虫工程师、 爬虫爱好者以及 Python 程序员。</p>
                  </blockquote>
                  <h1 id="作者是谁"><a href="#作者是谁" class="headerlink" title="作者是谁"></a>作者是谁</h1>
                  <p>这本书谁写的？靠不靠谱呢？ 这个靓仔就是我，韦世东。 <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122657.jpeg" alt=""> 作者韦世东是资深爬虫工程师，2019年华为云认证云享专家、掘金社区优秀作者、GitChat认证作者、搜狐产品技术约稿作者、夜幕团队成员。拥有七年互联网从业经验，擅长反爬虫的设计和绕过技巧。</p>
                  <h1 id="详细目录"><a href="#详细目录" class="headerlink" title="详细目录"></a>详细目录</h1>
                  <p>以下放出的章节目录为改版前的目录，大部分章和节都配套实战环节，实际上新版目录与这里的略有差异。 <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122853.jpg" alt=""> <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122912.jpg" alt=""> <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122926.jpg" alt=""> <img src="http://q0revehsm.bkt.clouddn.com/sfhfpc/20191118122946.jpg" alt=""></p>
                  <h1 id="什么时候可以买到？"><a href="#什么时候可以买到？" class="headerlink" title="什么时候可以买到？"></a>什么时候可以买到？</h1>
                  <p>审核、校对和排版工作早已进行，按照正常流程来说月底送印，双十一之前会在各大在线书城（如京东、当当等）跟大家见面。 同时也会开启直播送书、抽奖送书等活动。想要参与活动的朋友可以添加我好友，微信号：Domfreez。加好友进群以获得书籍和活动的最新消息。欢迎大家保持对《Python3 反爬虫原理与绕过实战》的关注，新书发布后会有很多活动回馈给大家!</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/韦世东学算法和反爬虫" class="author" itemprop="url" rel="index">韦世东学算法和反爬虫</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-10-10 17:57:43" itemprop="dateCreated datePublished" datetime="2019-10-10T17:57:43+08:00">2019-10-10</time>
                </span>
                <span id="/7712.html" class="post-meta-item leancloud_visitors" data-flag-title="书籍《Python3 反爬虫原理与绕过实战》详细目录和最新消息" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>796</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>1 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7701.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7701.html" class="post-title-link" itemprop="url">如何用 nativefier 将网页打包成客户端 App</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>大家有没有一种感觉，很多网站其实做得非常优秀，但是它们就是没有开发 PC （电脑）版的客户端，比如知乎、GitHub、微信公众号。 如果我们大多数时间都是使用 PC 开发或者办公的，每次开始时我们都需要打开浏览器输入它们的网址，进入对应的页面。另外一个浏览器中我们可能会开各种各样的选项卡，少则两三个，多则一二十个，这就导致某些我们常用的甚至重度依赖的网站在切换的时候就会不怎么方便。 比如挤在一堆浏览器里面的 GitHub，选项卡已经被挤得看不全了： <img src="https://qiniu.cuiqingcai.com/2019-10-09-132628.png" alt="image-20191009212626789"> 这时候，如果我们能有一个客户端，即 Window 上的 exe 程序或 Mac 上的 app 应用程序，它们的名字就叫做 GitHub、微信公众平台等等，打开之后只单独负责呈现 GitHub、微信公众号的内容，我们就可以免去在浏览器中来回寻找站点和切换站点的麻烦。 甚至说，在 Windows 上我们可以直接把这个应用放在桌面或把它 Pin 到任务栏上， Mac 上我们可以直接将它固定到 Dock 栏上，这样一键就打开了，省时省力。如果使用了快捷启动软件，比如 Wox （Windows）或 Alfred（Mac），直接输入 GitHub 或者微信公众平台，那就更方便唤出了，简直不要太方便。 而且，我个人感觉，用客户端软件比用网页更有一种「踏实感」，不知道大家会不会也有这种感觉。 所以，如果能将这些常用的或者重度依赖的网站转成客户端软件，那就再方便不过了。 比如我用的是 Mac，把 GitHub 转成客户端软件之后，我习惯性用 Alfred 呼出： <img src="https://qiniu.cuiqingcai.com/2019-10-09-134000.png" alt="image-20191009213959316"> 然后就打开了一个 GitHub.app： <img src="https://qiniu.cuiqingcai.com/2019-10-09-134127.png" alt="image-20191009214125953"> 然后把它固定到 Dock 栏上： <img src="https://qiniu.cuiqingcai.com/2019-10-09-134257.png" alt="image-20191009214254670"> 就仿佛拥有了一个 GitHub 的客户端，功能与网页一模一样，再也不用在浏览器里面切来切去。而且也不用担心版本更新的问题，因为它就是开了一个独立的网页，网页改版或者更新，内容就随着更新。 是不是很方便呢？ 如果你觉得是，那就随着我来了解一下怎样实现吧。</p>
                  <h2 id="nativefier"><a href="#nativefier" class="headerlink" title="nativefier"></a>nativefier</h2>
                  <p>这里需要用到的一个工具，名字叫做 nativefier，是基于 electron 开发的，它的功能就是把任意的网页转成一个电脑客户端，即 Desktop Application， 有了这个软件，把网页转成电脑客户端只需要这么一条简单的命令：</p>
                  <figure class="highlight xml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier <span class="tag">&lt;<span class="name">website</span>&gt;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>比如把 Whatsapp 的网站打包成一个客户端就只需要执行这样的命令：</p>
                  <figure class="highlight css">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-tag">nativefier</span> <span class="selector-tag">web</span><span class="selector-class">.whatsapp</span><span class="selector-class">.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>示意如下： <img src="https://qiniu.cuiqingcai.com/2019-10-09-135019.gif" alt="Walkthrough"> 怎样，不论是什么网页，就可以使用它来转换成一个客户端软件。 另外它支持三大操作系统，Windows、Linux、Mac，即用它可以将网页转成 <code>.exe</code>、<code>.app</code> 等格式。</p>
                  <h2 id="安装"><a href="#安装" class="headerlink" title="安装"></a>安装</h2>
                  <p>那么这软件究竟具体怎么来使用呢，第一步当然就是安装了。 由于 nativefier 是基于 electron 开发的，而后者又是基于 Node.js 的，所以要使用它必须要安装 Node.js，建议安装 6.0 以上版本。 另外在 Linux 和 Mac 平台可能需要安装其他的依赖。</p>
                  <ul>
                    <li>在 Linux 上需要安装 Wine 并配置好环境变量。</li>
                    <li>在 Mac 上需要安装 iconutil、imagemagick，这两个依赖是为了帮助程序处理 App 的 icon 的。</li>
                  </ul>
                  <p>具体的安装说明可以参见：<a href="https://github.com/jiahaog/nativefier#optional-dependencies" target="_blank" rel="noopener">https://github.com/jiahaog/nativefier#optional-dependencies</a>。 以上步骤完成之后，使用 npm 安装 nativefier 即可：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm <span class="keyword">install</span> nativefier -g</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装完毕之后便可以使用 nativefier 命令了。</p>
                  <h2 id="使用"><a href="#使用" class="headerlink" title="使用"></a>使用</h2>
                  <p>下面我在 Mac 下以 GitHub 为例来介绍下怎样将 GitHub 打包成一个客户端软件。 像刚才介绍的一样，最简单直接的，运行下面的命令就好了：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier <span class="string">https:</span><span class="comment">//github.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>它会尝试用 GitHub 主页的 title 来命名这个客户端，而 GitHub 的 title 比较长，叫做：</p>
                  <figure class="highlight delphi">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">The worlds leading software development <span class="keyword">platform</span>  GitHub</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>所以它会生成这样的一个客户端软件： <img src="https://qiniu.cuiqingcai.com/2019-10-09-140452.png" alt="image-20191009220450996"> 这个名字有点奇怪，我们可以使用命令的一个选项即可控制生成的客户端的名称，添加一个 name 参数即可：</p>
                  <figure class="highlight delphi">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier --<span class="keyword">name</span> GitHub https:<span class="comment">//github.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样便会生成一个名为 GitHub 的客户端： <img src="https://qiniu.cuiqingcai.com/2019-10-09-140719.png" alt="image-20191009220717549"> 另外我们可以看到客户端的图标也自动生成了，这个图标怎么来的呢？这个是用的 nativefier 维护的 icons，恰好 GitHub 在它们的收录范围内，所以就用上了。这些 icons 也是一个公开的 Repository，链接为： <a href="https://github.com/jiahaog/nativefier-icons" target="_blank" rel="noopener">https://github.com/jiahaog/nativefier-icons</a>，大家可以到这里搜集或者贡献图标。 如果我们觉得 nativefier 官方提供的图标不好看，想要自定义图标的话，也是可以的，只需要添加一个 icon 参数即可，这样便可以指定本地图片作为图标来生成了。 但值得注意的是，不同平台上要求的图标格式不一样。</p>
                  <ul>
                    <li>Windows 上需要 ico 格式。</li>
                    <li>Linux 上需要 png 格式。</li>
                    <li>Mac 上需要 icns 格式，如果安装了上文所需要的依赖，使用 png 格式也是可以的。</li>
                  </ul>
                  <p>具体的参数用法说明可以看：<a href="https://github.com/jiahaog/nativefier/blob/master/docs/api.md#icon" target="_blank" rel="noopener">https://github.com/jiahaog/nativefier/blob/master/docs/api.md#icon</a>。 好，那么在 Mac 上我安装了依赖，那就直接用 png 格式的图标了。 在这里我自己做了一个圆形的图标如下，命名为 github.png： <img src="https://qiniu.cuiqingcai.com/2019-10-09-143323.png" alt="2019-10-09-141852"> 然后把图片使用下面的命令就可以自定义图标了：</p>
                  <figure class="highlight jboss-cli">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier <span class="params">--name</span> GitHub <span class="params">--icon</span> <span class="string">./github.png</span> https:<span class="string">//github.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样就能生成自定义图标的客户端软件了。 打开之后，登录，我们就拥有了一个 GitHub 客户端了，界面和网页一模一样，但是已经摆脱了混杂选项卡的干扰，示意如下： <img src="https://qiniu.cuiqingcai.com/2019-10-09-143009.png" alt="image-20191009223006991"> 好了，这就是基本的用法，其实大部分情况只需要这几个参数就够了，如果想了解功能大家可以参考官方的 API 文档：<a href="https://github.com/jiahaog/nativefier/blob/master/docs/api.md#api" target="_blank" rel="noopener">https://github.com/jiahaog/nativefier/blob/master/docs/api.md#api</a>。 如果想要生成其他的客户端，如微信公众平台、知乎等等都是可以的。 如微信公众平台就是这样的： <img src="https://qiniu.cuiqingcai.com/2019-10-09-142258.png" alt="image-20191009222257275"></p>
                  <h2 id="注意"><a href="#注意" class="headerlink" title="注意"></a>注意</h2>
                  <p>在使用过程中我发现 name 参数对中文的支持并不好，总会生成一个 APP 的客户端，在这里推荐 name 使用英文名称，比如知乎用 Zhihu，微信平台用 WXMP 等等。 例如命令：</p>
                  <figure class="highlight jboss-cli">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier <span class="params">--name</span> 知乎 <span class="params">--icon</span> <span class="string">./zhihu.png</span> https:<span class="string">//www.zhihu.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以用下面的命令代替：</p>
                  <figure class="highlight jboss-cli">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">nativefier <span class="params">--name</span> Zhihu <span class="params">--icon</span> <span class="string">./zhihu.png</span> https:<span class="string">//www.zhihu.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>生成客户端软件知乎再手动修改下图标的名称即可。 另外生成的客户端软件是不支持插件的，如果你的站点对某些插件的依赖比较强，那就不建议使用 nativefier 转成的客户端了。 好了，这就是 nativefier 的基本用法，有了它我们就可以随意地将网页转成客户端软件了，快来试试吧！</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-10-10 10:02:13" itemprop="dateCreated datePublished" datetime="2019-10-10T10:02:13+08:00">2019-10-10</time>
                </span>
                <span id="/7701.html" class="post-meta-item leancloud_visitors" data-flag-title="如何用 nativefier 将网页打包成客户端 App" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>2.9k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>3 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7625.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7625.html" class="post-title-link" itemprop="url">利用 GitHub + Hexo + Next 从零搭建一个博客</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>趁着周末，搭建了一下 NightTeam 的官方博客和官方主页，耗时数个小时，两个站点终于完工了。 由于 NightTeam 的域名是 nightteam.cn，所以这里官方博客使用了二级域名 blog.nightteam.cn，官方主页使用了根域名 nightteam.cn，现在两个站点都已经稳定运行在 GitHub Pages 上面了，大家如果感兴趣可以去看一下。</p>
                  <ul>
                    <li>NightTeam HomePage：<a href="https://nightteam.cn/" target="_blank" rel="noopener">https://nightteam.cn/</a></li>
                    <li>NightTeam Blog: <a href="https://blog.nightteam.cn/" target="_blank" rel="noopener">https://blog.nightteam.cn/</a></li>
                  </ul>
                  <p>这里的主页就是用一个基本的静态页面搭建了，没有什么技术含量。博客相对复杂一点，使用了 Hexo 框架，采用了 Next 主题，在搭建的过程中我就顺手把搭建的流程大致记录下来了，在这里扩充一下形成一篇记录，毕竟好记性不如烂笔头。 <img src="https://qiniu.cuiqingcai.com/2019-09-26-114318.png" alt=""> <img src="https://qiniu.cuiqingcai.com/2019-09-26-114346.png" alt=""> 于是，这篇《利用 GitHub 从零开始搭建一个博客》的文章就诞生了。</p>
                  <h2 id="准备条件"><a href="#准备条件" class="headerlink" title="准备条件"></a>准备条件</h2>
                  <p>在这里先跟大家说一些准备条件，有些同学可能一听到搭建博客就望而却步。弄个博客网站，不得有台服务器吗？不得搞数据库吗？不得注册域名吗？没事，如果都没有，那照样是能搭建一个博客的。 GitHub 是个好东西啊，它提供了 GitHub Pages 帮助我们来架设一个静态网站，这就解决了服务器的问题。 Hexo 这个博客框架没有那么重量级，它是 MarkDown 直接写文章的，然后 Hexo 可以直接将文章编译成静态网页文件并发布，所以这样文章的内容、标题、标签等信息就没必要存数据库里面了，是直接纯静态页面了，这就解决了数据库的问题。 GitHub Pages 允许每个账户创建一个名为 {username}.github.io 的仓库，另外它还会自动为这个仓库分配一个 github.io 的二级域名，这就解决了域名的问题，当然如果想要自定义域名的话，也可以支持。 所以说，基本上，先注册个 GitHub 账号就能搞了，下面我们来正式开始吧。</p>
                  <h2 id="新建项目"><a href="#新建项目" class="headerlink" title="新建项目"></a>新建项目</h2>
                  <p>首先在 GitHub 新建一个仓库（Repository），名称为 {username}.github.io，注意这个名比较特殊，必须要是 github.io 为后缀结尾的。比如 NightTeam 的 GitHub 用户名就叫 NightTeam，那我就新建一个 nightteam.github.io，新建完成之后就可以进行后续操作了。 另外如果 GitHub 没有配置 SSH 连接的建议配置一下，这样后面在部署博客的时候会更方便。</p>
                  <h2 id="安装环境"><a href="#安装环境" class="headerlink" title="安装环境"></a>安装环境</h2>
                  <h3 id="安装-Node-js"><a href="#安装-Node-js" class="headerlink" title="安装 Node.js"></a>安装 Node.js</h3>
                  <p>首先在自己的电脑上安装 Node.js，下载地址：<a href="https://nodejs.org/zh-cn/download/" target="_blank" rel="noopener">https://nodejs.org/zh-cn/download/</a>，可以安装 Stable 版本。 安装完毕之后，确保环境变量配置好，能正常使用 <code>npm</code> 命令。</p>
                  <h3 id="安装-Hexo"><a href="#安装-Hexo" class="headerlink" title="安装 Hexo"></a>安装 Hexo</h3>
                  <p>接下来就需要安装 Hexo 了，这是一个博客框架，Hexo 官方还提供了一个命令行工具，用于快速创建项目、页面、编译、部署 Hexo 博客，所以在这之前我们需要先安装 Hexo 的命令行工具。 命令如下：</p>
                  <figure class="highlight avrasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm install -g hexo-<span class="keyword">cli</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装完毕之后，确保环境变量配置好，能正常使用 <code>hexo</code> 命令。</p>
                  <h2 id="初始化项目"><a href="#初始化项目" class="headerlink" title="初始化项目"></a>初始化项目</h2>
                  <p>接下来我们使用 Hexo 的命令行创建一个项目，并将其在本地跑起来，整体跑通看看。 首先使用如下命令创建项目：</p>
                  <figure class="highlight applescript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo init &#123;<span class="built_in">name</span>&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里的 name 就是项目名，我这里要创建 NightTeam 的博客，我就把项目取名为 nightteam 了，用了纯小写，命令如下：</p>
                  <figure class="highlight ebnf">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attribute">hexo init nightteam</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样 nightteam 文件夹下就会出现 Hexo 的初始化文件，包括 themes、scaffolds、source 等文件夹，这些内容暂且先不用管是做什么的，我们先知道有什么，然后一步步走下去看看都发生了什么变化。 接下来我们首先进入新生成的文件夹里面，然后调用 Hexo 的 generate 命令，将 Hexo 编译生成 HTML 代码，命令如下：</p>
                  <figure class="highlight verilog">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo <span class="keyword">generate</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到输出结果里面包含了 js、css、font 等内容，并发现他们都处在了项目根目录下的 public 文件夹下面了。 然后我们利用 Hexo 提供的 server 命令把博客在本地运行起来，命令如下：</p>
                  <figure class="highlight axapta">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo <span class="keyword">server</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行之后命令行输出如下：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">INFO</span>  <span class="keyword">Start</span> processing</span><br><span class="line"><span class="keyword">INFO</span>  Hexo <span class="keyword">is</span> running at http://localhost:<span class="number">4000</span> . Press Ctrl+C <span class="keyword">to</span> stop.</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>它告诉我们在本地 4000 端口上就可以查看博客站点了，如图所示： <img src="https://qiniu.cuiqingcai.com/2019-09-20-064939.png" alt=""> 这样一个博客的架子就出来了，我们只用了三个命令就完成了。</p>
                  <h2 id="部署"><a href="#部署" class="headerlink" title="部署"></a>部署</h2>
                  <p>接下来我们来将这个初始化的博客进行一下部署，放到 GitHub Pages 上面验证一下其可用性。成功之后我们可以再进行后续的修改，比如修改主题、修改页面配置等等。 那么怎么把这个页面部署到 GitHub Pages 上面呢，其实 Hexo 已经给我们提供一个命令，利用它我们可以直接将博客一键部署，不需要手动去配置服务器或进行其他的各项配置。 部署命令如下：</p>
                  <figure class="highlight ebnf">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attribute">hexo deploy</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在部署之前，我们需要先知道博客的部署地址，它需要对应 GitHub 的一个 Repository 的地址，这个信息需要我们来配置一下。 打开根目录下的 _config.yml 文件，找到 Deployment 这个地方，把刚才新建的 Repository 的地址贴过来，然后指定分支为 master 分支，最终修改为如下内容：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta"># Deployment</span></span><br><span class="line"><span class="meta">## Docs: https:<span class="comment">//hexo.io/docs/deployment.html</span></span></span><br><span class="line"><span class="symbol">deploy:</span></span><br><span class="line"><span class="symbol">  type:</span> git</span><br><span class="line"><span class="symbol">  repo:</span> &#123;git repo ssh address&#125;</span><br><span class="line"><span class="symbol">  branch:</span> master</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>我的就修改为如下内容：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta"># Deployment</span></span><br><span class="line"><span class="meta">## Docs: https:<span class="comment">//hexo.io/docs/deployment.html</span></span></span><br><span class="line"><span class="symbol">deploy:</span></span><br><span class="line"><span class="symbol">  type:</span> git</span><br><span class="line"><span class="symbol">  repo:</span> git@github.com:NightTeam/nightteam.github.io.git</span><br><span class="line"><span class="symbol">  branch:</span> master</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>另外我们还需要额外安装一个支持 Git 的部署插件，名字叫做 hexo-deployer-git，有了它我们才可以顺利将其部署到 GitHub 上面，如果不安装的话，在执行部署命令时会报如下错误：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Deployer not <span class="string">found:</span> git</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>好，那就让我们安装下这个插件，在项目目录下执行安装命令如下：</p>
                  <figure class="highlight sql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm <span class="keyword">install</span> hexo-deployer-git <span class="comment">--save</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装成功之后，执行部署命令：</p>
                  <figure class="highlight ebnf">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attribute">hexo deploy</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果类似如下：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">INFO  Deploying: git</span><br><span class="line">INFO  Clearing .deploy_git folder...</span><br><span class="line">INFO  Copying files <span class="keyword">from</span> <span class="keyword">public</span> folder...</span><br><span class="line">INFO  Copying files <span class="keyword">from</span> extend dirs...</span><br><span class="line">On branch master</span><br><span class="line">nothing to commit, working directory clean</span><br><span class="line">Counting objects: <span class="number">46</span>, done.</span><br><span class="line">Delta compression using up to <span class="number">8</span> threads.</span><br><span class="line">Compressing objects: <span class="number">100</span>% (<span class="number">36</span>/<span class="number">36</span>), done.</span><br><span class="line">Writing objects: <span class="number">100</span>% (<span class="number">46</span>/<span class="number">46</span>), <span class="number">507.66</span> KiB | <span class="number">0</span> bytes/s, done.</span><br><span class="line">Total <span class="number">46</span> (delta <span class="number">3</span>), reused <span class="number">0</span> (delta <span class="number">0</span>)</span><br><span class="line">remote: Resolving deltas: <span class="number">100</span>% (<span class="number">3</span>/<span class="number">3</span>), done.</span><br><span class="line">To <span class="symbol">git@</span>github.com:NightTeam/nightteam.github.io.git</span><br><span class="line"> * [new branch]      HEAD -&gt; master</span><br><span class="line">Branch master <span class="keyword">set</span> up to track remote branch master <span class="keyword">from</span> <span class="symbol">git@</span>github.com:NightTeam/nightteam.github.io.git.</span><br><span class="line">INFO  Deploy done: git</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>如果出现类似上面的内容，就证明我们的博客已经成功部署到 GitHub Pages 上面了，这时候我们访问一下 GitHub Repository 同名的链接，比如我的 NightTeam 博客的 Repository 名称取的是 nightteam.github.io，那我就访问 <a href="http://nightteam.github.io" target="_blank" rel="noopener">http://nightteam.github.io</a>，这时候我们就可以看到跟本地一模一样的博客内容了。 <img src="https://qiniu.cuiqingcai.com/2019-09-20-062934.png" alt=""> 这时候我们去 GitHub 上面看看 Hexo 上传了什么内容，打开之后可以看到 master 分支有了这样的内容： <img src="https://qiniu.cuiqingcai.com/2019-09-26-112229.png" alt=""> 仔细看看，这实际上是博客文件夹下面的 public 文件夹下的所有内容，Hexo 把编译之后的静态页面内容上传到 GitHub 的 master 分支上面去了。 这时候可能就有人有疑问了，那我博客的源码也想放到 GitHub 上面怎么办呢？其实很简单，新建一个其他的分支就好了，比如我这边就新建了一个 source 分支，代表博客源码的意思。 具体的添加过程就很简单了，参见如下命令：</p>
                  <figure class="highlight properties">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">git</span> <span class="string">init</span></span><br><span class="line"><span class="attr">git</span> <span class="string">checkout -b source</span></span><br><span class="line"><span class="attr">git</span> <span class="string">add -A</span></span><br><span class="line"><span class="attr">git</span> <span class="string">commit -m "init blog"</span></span><br><span class="line"><span class="attr">git</span> <span class="string">remote add origin git@github.com:&#123;username&#125;/&#123;username&#125;.github.io.git</span></span><br><span class="line"><span class="attr">git</span> <span class="string">push origin source</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>成功之后，可以到 GitHub 上再切换下默认分支，比如我就把默认的分支设置为了 source，当然不换也可以。</p>
                  <h2 id="配置站点信息"><a href="#配置站点信息" class="headerlink" title="配置站点信息"></a>配置站点信息</h2>
                  <p>完成如上内容之后，实际上我们只完成了博客搭建的一小步，因为我们仅仅是把初始化的页面部署成功了，博客里面还没有设置任何有效的信息。下面就让我们来进行一下博客的基本配置，另外换一个好看的主题，配置一些其他的内容，让博客真正变成属于我们自己的博客吧。 下面我就以自己的站点 NightTeam 为例，修改一些基本的配置，比如站点名、站点描述等等。 修改根目录下的 _config.yml 文件，找到 Site 区域，这里面可以配置站点标题 title、副标题 subtitle、关键字 keywords 等内容，比如我的就修改为如下内容：</p>
                  <figure class="highlight avrasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta"># Site</span></span><br><span class="line"><span class="symbol">title:</span> NightTeam</span><br><span class="line"><span class="symbol">subtitle:</span> 一个专注技术的组织</span><br><span class="line"><span class="symbol">description:</span> 涉猎的主要编程语言为 Python、Rust、C++、Go，领域涵盖爬虫、深度学习、服务研发和对象存储等。</span><br><span class="line"><span class="symbol">keywords:</span> <span class="string">"Python, Rust, C++, Go, 爬虫, 深度学习, 服务研发, 对象存储"</span></span><br><span class="line"><span class="symbol">author:</span> NightTeam</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里大家可以参照格式把内容改成自己的。 另外还可以设置一下语言，如果要设置为汉语的话可以将 language 的字段设置为 zh-CN，修改如下：</p>
                  <figure class="highlight avrasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">language:</span> <span class="built_in">zh</span>-CN</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样就完成了站点基本信息的配置，完成之后可以看到一些基本信息就修改过来了，页面效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-032710.png" alt=""></p>
                  <h2 id="修改主题"><a href="#修改主题" class="headerlink" title="修改主题"></a>修改主题</h2>
                  <p>目前来看，整个页面的样式个人感觉并不是那么好看，想换一个风格，这就涉及到主题的配置了。目前 Hexo 里面应用最多的主题基本就是 Next 主题了，个人感觉这个主题还是挺好看的，另外它支持的插件和功能也极为丰富，配置了这个主题，我们的博客可以支持更多的扩展功能，比如阅览进度条、中英文空格排版、图片懒加载等等。 那么首先就让我们来安装下 Next 这个主题吧，目前 Next 主题已经更新到 7.x 版本了，我们可以直接到 Next 主题的 GitHub Repository 上把这个主题下载下来。 主题的 GitHub 地址是：<a href="https://github.com/theme-next/hexo-theme-next" target="_blank" rel="noopener">https://github.com/theme-next/hexo-theme-next</a>，我们可以直接把 master 分支 Clone 下来。 首先命令行进入到项目的根目录，执行如下命令即可：</p>
                  <figure class="highlight vim">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">git clone http<span class="variable">s:</span>//github.<span class="keyword">com</span>/theme-<span class="keyword">next</span>/hexo-theme-<span class="keyword">next</span> themes/<span class="keyword">next</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>执行完毕之后 Next 主题的源码就会出现在项目的 themes/next 文件夹下。 然后我们需要修改下博客所用的主题名称，修改项目根目录下的 _config.yml 文件，找到 theme 字段，修改为 next 即可，修改如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">theme: <span class="keyword">next</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后本地重新开启服务，访问刷新下页面，就可以看到 next 主题就切换成功了，预览效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-034504.png" alt=""></p>
                  <h2 id="主题配置"><a href="#主题配置" class="headerlink" title="主题配置"></a>主题配置</h2>
                  <p>现在我们已经成功切换到 next 主题上面了，接下来我们就对主题进行进一步地详细配置吧，比如修改样式、增加其他各项功能的支持，下面逐项道来。 Next 主题内部也提供了一个配置文件，名字同样叫做 _config.yml，只不过位置不一样，它在 themes/next 文件夹下，Next 主题里面所有的功能都可以通过这个配置文件来控制，下文所述的内容都是修改的 themes/next/_config.yml 文件。</p>
                  <h3 id="样式"><a href="#样式" class="headerlink" title="样式"></a>样式</h3>
                  <p>Next 主题还提供了多种样式，风格都是类似黑白的搭配，但整个布局位置不太一样，通过修改配置文件的 scheme 字段即可，我选了 Pisces 样式，修改 _config.yml （注意是 themes/next/_config.yml 文件）如下：</p>
                  <figure class="highlight avrasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">scheme:</span> Pisces</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>刷新页面之后就会变成这种样式，如图所示： <img src="https://qiniu.cuiqingcai.com/2019-09-26-034750.png" alt=""> 另外还有几个可选项，比如：</p>
                  <figure class="highlight vala">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta"># scheme: Muse</span></span><br><span class="line"><span class="meta">#scheme: Mist</span></span><br><span class="line">scheme: Pisces</span><br><span class="line"><span class="meta">#scheme: Gemini</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>大家可以自行根据喜好选择。</p>
                  <h3 id="favicon"><a href="#favicon" class="headerlink" title="favicon"></a>favicon</h3>
                  <p>favicon 就是站点标签栏的小图标，默认是用的 Hexo 的小图标，如果我们有站点 Logo 的图片的话，我们可以自己定制小图标。 但这并不意味着我们需要自己用 PS 来设计，已经有一个网站可以直接将图片转化为站点小图标，站点链接为：<a href="https://realfavicongenerator.net" target="_blank" rel="noopener">https://realfavicongenerator.net</a>，到这里上传一张图，便可以直接打包下载各种尺寸和适配不同设备的小图标。 图标下载下来之后把它放在 themes/next/source/images 目录下面。 然后在配置文件里面找到 favicon 配置项，把一些相关路径配置进去即可，示例如下：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">favicon:</span></span><br><span class="line"><span class="symbol">  small:</span> <span class="meta-keyword">/images/</span>favicon<span class="number">-16</span>x16.png</span><br><span class="line"><span class="symbol">  medium:</span> <span class="meta-keyword">/images/</span>favicon<span class="number">-32</span>x32.png</span><br><span class="line"><span class="symbol">  apple_touch_icon:</span> <span class="meta-keyword">/images/</span>apple-touch-icon.png</span><br><span class="line"><span class="symbol">  safari_pinned_tab:</span> <span class="meta-keyword">/images/</span>safari-pinned-tab.svg</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>配置完成之后刷新页面，整个页面的标签图标就被更新了。</p>
                  <h3 id="avatar"><a href="#avatar" class="headerlink" title="avatar"></a>avatar</h3>
                  <p>avatar 这个就类似站点的头像，如果设置了这个，会在站点的作者信息旁边额外显示一个头像，比如我这边有一张 avatar.png 图片： <img src="https://qiniu.cuiqingcai.com/2019-09-26-035351.png" alt=""> 将其放置到 themes/next/source/images/avatar.png 路径，然后在主题 _config.yml 文件下编辑 avatar 的配置，修改为正确的路径即可。</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># Sidebar Avatar</span></span><br><span class="line"><span class="attr">avatar:</span></span><br><span class="line">  <span class="comment"># In theme directory (source/images): /images/avatar.gif</span></span><br><span class="line">  <span class="comment"># In site directory (source/uploads): /uploads/avatar.gif</span></span><br><span class="line">  <span class="comment"># You can also use other linking images.</span></span><br><span class="line">  <span class="attr">url:</span> <span class="string">/images/avatar.png</span></span><br><span class="line">  <span class="comment"># If true, the avatar would be dispalyed in circle.</span></span><br><span class="line">  <span class="attr">rounded:</span> <span class="literal">true</span></span><br><span class="line">  <span class="comment"># If true, the avatar would be rotated with the cursor.</span></span><br><span class="line">  <span class="attr">rotated:</span> <span class="literal">true</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里的 rounded 选项表示是否显示为圆形，rotated 选项表示是否带有旋转效果，大家可以根据喜好选择是否开启。 效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-035817.png" alt=""> 配置完成之后就会显示头像。</p>
                  <h3 id="rss"><a href="#rss" class="headerlink" title="rss"></a>rss</h3>
                  <p>博客一般是需要 RSS 订阅的，如果要开启 RSS 订阅，这里需要安装一个插件，叫做 hexo-generator-feed，安装完成之后，站点会自动生成 RSS Feed 文件，安装命令如下：</p>
                  <figure class="highlight sql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm <span class="keyword">install</span> hexo-generator-feed <span class="comment">--save</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在项目根目录下运行这个命令，安装完成之后不需要其他的配置，以后每次编译生成站点的时候就会自动生成 RSS Feed 文件了。</p>
                  <h3 id="code"><a href="#code" class="headerlink" title="code"></a>code</h3>
                  <p>作为程序猿，代码块的显示还是需要很讲究的，默认的代码块我个人不是特别喜欢，因此我把代码的颜色修改为黑色，并把复制按钮的样式修改为类似 Mac 的样式，修改 _config.yml 文件的 codeblock 区块如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">codeblock:</span></span><br><span class="line">  <span class="comment"># Code Highlight theme</span></span><br><span class="line">  <span class="comment"># Available values: normal | night | night eighties | night blue | night bright</span></span><br><span class="line">  <span class="comment"># See: https://github.com/chriskempson/tomorrow-theme</span></span><br><span class="line">  <span class="attr">highlight_theme:</span> <span class="string">night</span> <span class="string">bright</span></span><br><span class="line">  <span class="comment"># Add copy button on codeblock</span></span><br><span class="line">  <span class="attr">copy_button:</span></span><br><span class="line">    <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">    <span class="comment"># Show text copy result.</span></span><br><span class="line">    <span class="attr">show_result:</span> <span class="literal">true</span></span><br><span class="line">    <span class="comment"># Available values: default | flat | mac</span></span><br><span class="line">    <span class="attr">style:</span> <span class="string">mac</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>修改前的代码样式： <img src="https://qiniu.cuiqingcai.com/2019-09-26-040437.png" alt=""> 修改后的代码样式： <img src="https://qiniu.cuiqingcai.com/2019-09-26-040510.png" alt=""> 嗯，个人觉得整体看起来逼格高了不少。</p>
                  <h3 id="top"><a href="#top" class="headerlink" title="top"></a>top</h3>
                  <p>我们在浏览网页的时候，如果已经看完了想快速返回到网站的上端，一般都是有一个按钮来辅助的，这里也支持它的配置，修改 _config.yml 的 back2top 字段即可，我的设置如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">back2top:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">  <span class="comment"># Back to top in sidebar.</span></span><br><span class="line">  <span class="attr">sidebar:</span> <span class="literal">false</span></span><br><span class="line">  <span class="comment"># Scroll percent label in b2t button.</span></span><br><span class="line">  <span class="attr">scrollpercent:</span> <span class="literal">true</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>enable 默认为 true，即默认显示。sidebar 如果设置为 true，按钮会出现在侧栏下方，个人觉得并不是很好看，就取消了，scrollpercent 就是显示阅读百分比，个人觉得还不错，就将其设置为 true。 具体的效果大家可以设置后根据喜好选择。</p>
                  <h3 id="reading-process"><a href="#reading-process" class="headerlink" title="reading_progress"></a>reading_progress</h3>
                  <p>reading_progress，阅读进度。大家可能注意到有些站点的最上侧会出现一个细细的进度条，代表页面加载进度和阅读进度，如果大家想设置的话也可以试试，我将其打开了，修改 _config.yml 如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">reading_progress:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">  <span class="comment"># Available values: top | bottom</span></span><br><span class="line">  <span class="attr">position:</span> <span class="string">top</span></span><br><span class="line">  <span class="attr">color:</span> <span class="string">"#222"</span></span><br><span class="line">  <span class="attr">height:</span> <span class="string">2px</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>设置之后显示效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-041228.png" alt=""></p>
                  <h3 id="bookmark"><a href="#bookmark" class="headerlink" title="bookmark"></a>bookmark</h3>
                  <p>书签，可以根据阅读历史记录，在下次打开页面的时候快速帮助我们定位到上次的位置，大家可以根据喜好开启和关闭，我的配置如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">bookmark:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">false</span></span><br><span class="line">  <span class="comment"># Customize the color of the bookmark.</span></span><br><span class="line">  <span class="attr">color:</span> <span class="string">"#222"</span></span><br><span class="line">  <span class="comment"># If auto, save the reading progress when closing the page or clicking the bookmark-icon.</span></span><br><span class="line">  <span class="comment"># If manual, only save it by clicking the bookmark-icon.</span></span><br><span class="line">  <span class="attr">save:</span> <span class="string">auto</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="github-banner"><a href="#github-banner" class="headerlink" title="github_banner"></a>github_banner</h3>
                  <p>在一些技术博客上，大家可能注意到在页面的右上角有个 GitHub 图标，点击之后可以跳转到其源码页面，可以为 GitHub Repository 引流，大家如果想显示的话可以自行选择打开，我的配置如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># `Follow me on GitHub` banner in the top-right corner.</span></span><br><span class="line"><span class="attr">github_banner:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">  <span class="attr">permalink:</span> <span class="string">https://github.com/NightTeam/nightteam.github.io</span></span><br><span class="line">  <span class="attr">title:</span> <span class="string">NightTeam</span> <span class="string">GitHub</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>记得修改下链接 permalink 和标题 title，显示效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-041726.png" alt=""> 可以看到在页面右上角显示了 GitHub 的图标，点击可以进入 Repository 页面。</p>
                  <h3 id="gitalk"><a href="#gitalk" class="headerlink" title="gitalk"></a>gitalk</h3>
                  <p>由于 Hexo 的博客是静态博客，而且也没有连接数据库的功能，所以它的评论功能是不能自行集成的，但可以集成第三方的服务。 Next 主题里面提供了多种评论插件的集成，有 changyan | disqus | disqusjs | facebook_comments_plugin | gitalk | livere | valine | vkontakte 这些。 作为一名程序员，我个人比较喜欢 gitalk，它是利用 GitHub 的 Issue 来当评论，样式也比较不错。 首先需要在 GitHub 上面注册一个 OAuth Application，链接为：<a href="https://github.com/settings/applications/new" target="_blank" rel="noopener">https://github.com/settings/applications/new</a>，注册完毕之后拿到 Client ID、Client Secret 就可以了。 然后需要在 _config.yml 文件的 comments 区域配置使用 gitalk：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># Multiple Comment System Support</span></span><br><span class="line">comments:</span><br><span class="line">  # Available values: tabs | buttons</span><br><span class="line">  style: tabs</span><br><span class="line">  # Choose a comment<span class="built_in"> system </span><span class="keyword">to</span> be displayed by default.</span><br><span class="line">  # Available values: changyan | disqus | disqusjs | facebook_comments_plugin | gitalk | livere | valine | vkontakte</span><br><span class="line">  active: gitalk</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>主要是 comments.active 字段选择对应的名称即可。 然后找到 gitalk 配置，添加它的各项配置：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># Gitalk</span></span><br><span class="line"><span class="comment"># Demo: https://gitalk.github.io</span></span><br><span class="line"><span class="comment"># For more information: https://github.com/gitalk/gitalk</span></span><br><span class="line">gitalk:</span><br><span class="line">  enable: <span class="literal">true</span></span><br><span class="line">  github_id: NightTeam</span><br><span class="line">  repo: nightteam.github.io # Repository name <span class="keyword">to</span> store issues</span><br><span class="line">  client_id: &#123;your<span class="built_in"> client </span>id&#125; # GitHub Application<span class="built_in"> Client </span>ID</span><br><span class="line">  client_secret: &#123;your<span class="built_in"> client </span>secret&#125; # GitHub Application<span class="built_in"> Client </span>Secret</span><br><span class="line">  admin_user: germey # GitHub repo owner <span class="keyword">and</span> collaborators, only these guys can initialize gitHub issues</span><br><span class="line">  distraction_free_mode: <span class="literal">true</span> # Facebook-like distraction free mode</span><br><span class="line">  # Gitalk<span class="string">'s display language depends on user'</span>s browser <span class="keyword">or</span><span class="built_in"> system </span>environment</span><br><span class="line">  # <span class="keyword">If</span> you want everyone visiting your site <span class="keyword">to</span> see a uniform language, you can <span class="builtin-name">set</span> a force language value</span><br><span class="line">  # Available values: en | es-ES | fr | ru | zh-CN | zh-TW</span><br><span class="line">  language: zh-CN</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>配置完成之后 gitalk 就可以使用了，点击进入文章页面，就会出现如下页面： <img src="https://qiniu.cuiqingcai.com/2019-09-26-043331.png" alt=""> GitHub 授权登录之后就可以使用了，评论的内容会自动出现在 Issue 里面。</p>
                  <h3 id="pangu"><a href="#pangu" class="headerlink" title="pangu"></a>pangu</h3>
                  <p>我个人有个强迫症，那就是写中文和英文的时候中间必须要留有间距，一个简单直接的方法就是中间加个空格，但某些情况下可能习惯性不加或者忘记加了，这就导致中英文混排并不是那么美观。 pangu 就是来解决这个问题的，我们只需要在主题里面开启这个选项，在编译生成页面的时候，中英文之间就会自动添加空格，看起来更加美观。 具体的修改如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">pangu:</span> <span class="literal">true</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="math"><a href="#math" class="headerlink" title="math"></a>math</h3>
                  <p>可能在一些情况下我们需要写一个公式，比如演示一个算法推导过程，MarkDown 是支持公式显示的，Hexo 的 Next 主题同样是支持的。 Next 主题提供了两个渲染引擎，分别是 mathjax 和 katex，后者相对前者来说渲染速度更快，而且不需要 JavaScript 的额外支持，但后者支持的功能现在还不如前者丰富，具体的对比可以看官方文档：<a href="https://theme-next.org/docs/third-party-services/math-equations" target="_blank" rel="noopener">https://theme-next.org/docs/third-party-services/math-equations</a>。 所以我这里选择了 mathjax，通过修改配置即可启用：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">math:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line"></span><br><span class="line">  <span class="comment"># Default (true) will load mathjax / katex script on demand.</span></span><br><span class="line">  <span class="comment"># That is it only render those page which has `mathjax: true` in Front-matter.</span></span><br><span class="line">  <span class="comment"># If you set it to false, it will load mathjax / katex srcipt EVERY PAGE.</span></span><br><span class="line">  <span class="attr">per_page:</span> <span class="literal">true</span></span><br><span class="line"></span><br><span class="line">  <span class="comment"># hexo-renderer-pandoc (or hexo-renderer-kramed) required for full MathJax support.</span></span><br><span class="line">  <span class="attr">mathjax:</span></span><br><span class="line">    <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">    <span class="comment"># See: https://mhchem.github.io/MathJax-mhchem/</span></span><br><span class="line">    <span class="attr">mhchem:</span> <span class="literal">true</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>mathjax 的使用需要我们额外安装一个插件，叫做 hexo-renderer-kramed，另外也可以安装 hexo-renderer-pandoc，命令如下：</p>
                  <figure class="highlight ada">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm un hexo-renderer-marked <span class="comment">--save</span></span><br><span class="line">npm i hexo-renderer-kramed <span class="comment">--save</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>另外还有其他的插件支持，大家可以到官方文档查看。</p>
                  <h3 id="pjax"><a href="#pjax" class="headerlink" title="pjax"></a>pjax</h3>
                  <p>可能大家听说过 Ajax，没听说过 pjax，这个技术实际上就是利用 Ajax 技术实现了局部页面刷新，既可以实现 URL 的更换，又可以做到无刷新加载。 要开启这个功能需要先将 pjax 功能开启，然后安装对应的 pjax 依赖库，首先修改 _config.yml 修改如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">pjax:</span> <span class="literal">true</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后切换到 next 主题目录下，安装依赖库：</p>
                  <figure class="highlight crystal">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">$ cd themes/<span class="keyword">next</span></span><br><span class="line">$ git clone <span class="symbol">https:</span>/<span class="regexp">/github.com/theme</span>-<span class="keyword">next</span>/theme-<span class="keyword">next</span>-pjax source/<span class="class"><span class="keyword">lib</span>/<span class="title">pjax</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样 pjax 就开启了，页面就可以实现无刷新加载了。 另外关于 Next 主题的设置还有挺多的，这里就介绍到这里了，更多的主题设置大家可以参考官方文档：<a href="https://theme-next.org/docs/" target="_blank" rel="noopener">https://theme-next.org/docs/</a>。</p>
                  <h2 id="文章"><a href="#文章" class="headerlink" title="文章"></a>文章</h2>
                  <p>现在整个站点只有一篇文章，那么我们怎样来增加其他的文章呢？ 这个很简单，只需要调用 Hexo 提供的命令即可，比如我们要新建一篇「HelloWorld」的文章，命令如下：</p>
                  <figure class="highlight haxe">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo <span class="keyword">new</span> <span class="type">hello</span>-world</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>创建的文章会出现在 <code>source/_posts</code> 文件夹下，是 MarkDown 格式。 在文章开头通过如下格式添加必要信息：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta">---</span></span><br><span class="line"><span class="attr">title:</span> <span class="string">标题</span> <span class="comment"># 自动创建，如 hello-world</span></span><br><span class="line"><span class="attr">date:</span> <span class="string">日期</span> <span class="comment"># 自动创建，如 2019-09-22 01:47:21</span></span><br><span class="line"><span class="attr">tags:</span> </span><br><span class="line"><span class="bullet">-</span> <span class="string">标签1</span></span><br><span class="line"><span class="bullet">-</span> <span class="string">标签2</span></span><br><span class="line"><span class="bullet">-</span> <span class="string">标签3</span></span><br><span class="line"><span class="attr">categories:</span></span><br><span class="line"><span class="bullet">-</span> <span class="string">分类1</span></span><br><span class="line"><span class="bullet">-</span> <span class="string">分类2</span></span><br><span class="line"><span class="meta">---</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>开头下方撰写正文，MarkDown 格式书写即可。 这样在下次编译的时候就会自动识别标题、时间、类别等等，另外还有其他的一些参数设置，可以参考文档：<a href="https://hexo.io/zh-cn/docs/writing.html" target="_blank" rel="noopener">https://hexo.io/zh-cn/docs/writing.html</a>。</p>
                  <h2 id="标签页"><a href="#标签页" class="headerlink" title="标签页"></a>标签页</h2>
                  <p>现在我们的博客只有首页、文章页，如果我们想要增加标签页，可以自行添加，这里 Hexo 也给我们提供了这个功能，在根目录执行命令如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo new<span class="built_in"> page </span>tags</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>执行这个命令之后会自动帮我们生成一个 source/tags/index.md 文件，内容就只有这样子的：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta">---</span></span><br><span class="line"><span class="attr">title:</span> <span class="string">tags</span></span><br><span class="line"><span class="attr">date:</span> <span class="number">2019</span><span class="number">-09</span><span class="number">-26</span> <span class="number">16</span><span class="string">:44:17</span></span><br><span class="line"><span class="meta">---</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>我们可以自行添加一个 type 字段来指定页面的类型：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">type:</span> <span class="string">tags</span></span><br><span class="line"><span class="attr">comments:</span> <span class="literal">false</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后再在主题的 _config.yml 文件将这个页面的链接添加到主菜单里面，修改 menu 字段如下：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">menu:</span></span><br><span class="line"><span class="symbol">  home:</span> / || home</span><br><span class="line">  <span class="meta">#about: /about/ || user</span></span><br><span class="line"><span class="symbol">  tags:</span> <span class="meta-keyword">/tags/</span> || tags</span><br><span class="line">  <span class="meta">#categories: /categories/ || th</span></span><br><span class="line"><span class="symbol">  archives:</span> <span class="meta-keyword">/archives/</span> || archive</span><br><span class="line">  <span class="meta">#schedule: /schedule/ || calendar</span></span><br><span class="line">  <span class="meta">#sitemap: /sitemap.xml || sitemap</span></span><br><span class="line">  <span class="meta">#commonweal: /404/ || heartbeat</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样重新本地启动看下页面状态，效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-085417.png" alt=""> 可以看到左侧导航也出现了标签，点击之后右侧会显示标签的列表。</p>
                  <h3 id="分类页"><a href="#分类页" class="headerlink" title="分类页"></a>分类页</h3>
                  <p>分类功能和标签类似，一个文章可以对应某个分类，如果要增加分类页面可以使用如下命令创建：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo new<span class="built_in"> page </span>categories</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后同样地，会生成一个 source/categories/index.md 文件。 我们可以自行添加一个 type 字段来指定页面的类型：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">type:</span> <span class="string">categories</span></span><br><span class="line"><span class="attr">comments:</span> <span class="literal">false</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后再在主题的 _config.yml 文件将这个页面的链接添加到主菜单里面，修改 menu 字段如下：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">menu:</span></span><br><span class="line"><span class="symbol">  home:</span> / || home</span><br><span class="line">  <span class="meta">#about: /about/ || user</span></span><br><span class="line"><span class="symbol">  tags:</span> <span class="meta-keyword">/tags/</span> || tags</span><br><span class="line"><span class="symbol">  categories:</span> <span class="meta-keyword">/categories/</span> || th</span><br><span class="line"><span class="symbol">  archives:</span> <span class="meta-keyword">/archives/</span> || archive</span><br><span class="line">  <span class="meta">#schedule: /schedule/ || calendar</span></span><br><span class="line">  <span class="meta">#sitemap: /sitemap.xml || sitemap</span></span><br><span class="line">  <span class="meta">#commonweal: /404/ || heartbeat</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样页面就会增加分类的支持，效果如下： <img src="https://qiniu.cuiqingcai.com/2019-09-26-085755.png" alt=""></p>
                  <h2 id="搜索页"><a href="#搜索页" class="headerlink" title="搜索页"></a>搜索页</h2>
                  <p>很多情况下我们需要搜索全站的内容，所以一个搜索功能的支持也是很有必要的。 如果要添加搜索的支持，需要先安装一个插件，叫做 hexo-generator-searchdb，命令如下：</p>
                  <figure class="highlight sql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">npm <span class="keyword">install</span> hexo-generator-searchdb <span class="comment">--save</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后在项目的 _config.yml 里面添加搜索设置如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">search:</span></span><br><span class="line">  <span class="attr">path:</span> <span class="string">search.xml</span></span><br><span class="line">  <span class="attr">field:</span> <span class="string">post</span></span><br><span class="line">  <span class="attr">format:</span> <span class="string">html</span></span><br><span class="line">  <span class="attr">limit:</span> <span class="number">10000</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后在主题的 _config.yml 里面修改如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># Local search</span></span><br><span class="line"><span class="comment"># Dependencies: https://github.com/wzpan/hexo-generator-search</span></span><br><span class="line"><span class="attr">local_search:</span></span><br><span class="line">  <span class="attr">enable:</span> <span class="literal">true</span></span><br><span class="line">  <span class="comment"># If auto, trigger search by changing input.</span></span><br><span class="line">  <span class="comment"># If manual, trigger search by pressing enter key or search button.</span></span><br><span class="line">  <span class="attr">trigger:</span> <span class="string">auto</span></span><br><span class="line">  <span class="comment"># Show top n results per article, show all results by setting to -1</span></span><br><span class="line">  <span class="attr">top_n_per_article:</span> <span class="number">5</span></span><br><span class="line">  <span class="comment"># Unescape html strings to the readable one.</span></span><br><span class="line">  <span class="attr">unescape:</span> <span class="literal">false</span></span><br><span class="line">  <span class="comment"># Preload the search data when the page loads.</span></span><br><span class="line">  <span class="attr">preload:</span> <span class="literal">false</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里用的是 Local Search，如果想启用其他的 Search Service 的话可以参考官方文档：<a href="https://theme-next.org/docs/third-party-services/search-services" target="_blank" rel="noopener">https://theme-next.org/docs/third-party-services/search-services</a>。</p>
                  <h2 id="404-页面"><a href="#404-页面" class="headerlink" title="404 页面"></a>404 页面</h2>
                  <p>另外还需要添加一个 404 页面，直接在根目录 source 文件夹新建一个 404.md 文件即可，内容可以仿照如下：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="meta">---</span></span><br><span class="line"><span class="attr">title:</span> <span class="number">404</span> <span class="string">Not</span> <span class="string">Found</span></span><br><span class="line"><span class="attr">date:</span> <span class="number">2019</span><span class="number">-09</span><span class="number">-22</span> <span class="number">10</span><span class="string">:41:27</span></span><br><span class="line"><span class="meta">---</span></span><br><span class="line"></span><br><span class="line"><span class="string">&lt;center&gt;</span></span><br><span class="line"><span class="string">对不起，您所访问的页面不存在或者已删除。</span></span><br><span class="line"><span class="string">您可以&lt;a</span> <span class="string">href="https://blog.nightteam.cn"&gt;点击此处&lt;/a&gt;返回首页。</span></span><br><span class="line"><span class="string">&lt;/center&gt;</span></span><br><span class="line"></span><br><span class="line"><span class="string">&lt;blockquote</span> <span class="string">class="blockquote-center"&gt;</span></span><br><span class="line">    <span class="string">NightTeam</span></span><br><span class="line"><span class="string">&lt;/blockquote&gt;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里面的一些相关信息和链接可以替换成自己的。 增加了这个 404 页面之后，上面的配置基本就完成了大半了。其实 Hexo 还有很多很多功能，这里就介绍不过来了，大家可以直接参考官方文档：<a href="https://hexo.io/zh-cn/docs/" target="_blank" rel="noopener">https://hexo.io/zh-cn/docs/</a> 查看更多的配置。</p>
                  <h2 id="部署脚本"><a href="#部署脚本" class="headerlink" title="部署脚本"></a>部署脚本</h2>
                  <p>最后我这边还增加了一个简易版的部署脚本，其实就是重新 generate 下文件，然后重新部署。在根目录下新建一个 deploy.sh 的脚本文件，内容如下：</p>
                  <figure class="highlight verilog">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">hexo clean</span><br><span class="line">hexo <span class="keyword">generate</span></span><br><span class="line">hexo deploy</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样我们在部署发布的时候只需要执行：</p>
                  <figure class="highlight stata">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">sh</span> deploy.<span class="keyword">sh</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>就可以完成博客的更新了，非常简单。</p>
                  <h2 id="自定义域名"><a href="#自定义域名" class="headerlink" title="自定义域名"></a>自定义域名</h2>
                  <p>将页面修改之后可以用上面的脚本重新部署下博客，其内容便会跟着更新。 另外我们也可以在 GitHub 的 Repository 里面设置域名，找到 Settings，拉到下面，可以看到有个 GitHub Pages 的配置项，如图所示： <img src="https://qiniu.cuiqingcai.com/2019-09-26-112622.png" alt=""> 下面有个 custom domain 的选项，输入你想自定义的域名地址，然后添加 CNAME 解析就好了。 另外下面还有一个 Enforce HTTPS 的选项，GitHub Pages 会在我们配置自定义域名之后自动帮我们配置 HTTPS 服务。刚配置完自定义域名的时候可能这个选项是不可用的，一段时间后等到其可以勾选了，直接勾选即可，这样整个博客就会变成 HTTPS 的协议的了。 另外有一个值得注意的地方，如果配置了自定义域名，在目前的情况下，每次部署的时候这个自定义域名的设置是会被自动清除的。所以为了避免这个情况，我们需要在项目目录下面新建一个 CNAME 文件，路径为 source/CNAME，内容就是自定义域名。 比如我就在 source 目录下新建了一个 CNAME 文件，内容为：</p>
                  <figure class="highlight css">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-tag">blog</span><span class="selector-class">.nightteam</span><span class="selector-class">.cn</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样避免了每次部署的时候自定义域名被清除的情况了。 以上就是从零搭建一个 Hexo 博客的流程，希望对大家有帮助。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-30 10:41:36" itemprop="dateCreated datePublished" datetime="2019-09-30T10:41:36+08:00">2019-09-30</time>
                </span>
                <span id="/7625.html" class="post-meta-item leancloud_visitors" data-flag-title="利用 GitHub + Hexo + Next 从零搭建一个博客" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>14k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>13 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7544.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7544.html" class="post-title-link" itemprop="url">AndServer+Service 打造 Android 服务器实现 so 文件调用</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <h4 id="so-文件调用"><a href="#so-文件调用" class="headerlink" title="so 文件调用"></a>so 文件调用</h4>
                  <p>随着 Android 移动安全的高速发展，不管是为了执行效率还是程序的安全性等，关键代码下沉 native 层已成为基本操作。 native 层的开发通常指的就是 JNI/NDK 开发，通过 JNI 可以实现 java 层和 native 层（主要是 C/C++ ）的相互调用，native 层经编译后产生 so 动态链接库，so 文件具有可移植性广，执行效率高，保密性强等优点。 那么问题来了，如何调用 so 文件显得异常重要，当然你也可以直接分析 so 文件的伪代码，利用强悍的编程功底直接模拟关键操作，但是我想对于普通人来说头发还是比较重要的。 当前调用 so 文件的主流操作应该是： 1，基于 Unicorn 的各种实现（还在学习中，暂且不表） 2，Android 服务器的搭建，在 App 内起 http 服务完成调用 so 的需求（当然前提是过了 so 的校验等操作） 至于为什么选用 AndServer，好吧，不为什么，只是因为搜索到了它。 为什么结合 Service，在学习 Android 开发的时候了解到了 Service 的生命周期，个人理解用 Service 去创建 Http 服务比较好。 当然也有 Application 的简单使用，因为在正式环境中，大多数 so 文件的逻辑中都有对 context 的包名、签名等的校验，自定义 Application 的话获取 context 传参就好了。</p>
                  <h4 id="libyemu-so-简介"><a href="#libyemu-so-简介" class="headerlink" title="libyemu.so 简介"></a>libyemu.so 简介</h4>
                  <p>这是我编译好的一个 so 文件，就是根据入参做下简单的字符串拼接（以下是 native 层编译前的 c 代码）</p>
                  <figure class="highlight reasonml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">extern <span class="string">"C"</span></span><br><span class="line">JNIEXPORT jstring JNICALL</span><br><span class="line"><span class="constructor">Java_com_fw_myapplication_ndktest_NdkTest_stringFromUTF(JNIEnv <span class="operator">*</span><span class="params">env</span>, <span class="params">jobject</span> <span class="params">instance</span>, <span class="params">jstring</span> <span class="params">str_</span>)</span> &#123;</span><br><span class="line">    jclass String_clazz = env-&gt;<span class="constructor">FindClass(<span class="string">"java/lang/String"</span>)</span>;</span><br><span class="line"></span><br><span class="line">    jmethodID concat_methodID = env-&gt;<span class="constructor">GetMethodID(String_clazz, <span class="string">"concat"</span>, <span class="string">"(Ljava/lang/String;)Ljava/lang/String;"</span>)</span>;</span><br><span class="line"></span><br><span class="line">    jstring str = env-&gt;<span class="constructor">NewStringUTF(<span class="string">"  from so --[NightTeam夜幕]"</span>)</span>;</span><br><span class="line"></span><br><span class="line">    jobject str1 = env-&gt;<span class="constructor">CallObjectMethod(<span class="params">str_</span>, <span class="params">concat_methodID</span>, <span class="params">str</span>)</span>;</span><br><span class="line"></span><br><span class="line">    const <span class="built_in">char</span> *chars = env-&gt;<span class="constructor">GetStringUTFChars((<span class="params">jstring</span>)</span>str1, <span class="number">0</span>);</span><br><span class="line"></span><br><span class="line">    return env-&gt;<span class="constructor">NewStringUTF(<span class="params">chars</span>)</span>;</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这部分代码还是有必要贴一下的，简单的静态注册使用了反射的思想，反射在逆向中至关重要 接下来是 java 代码，定义了 native 函数</p>
                  <figure class="highlight arduino">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">package com.fw.myapplication.ndktest;</span><br><span class="line"></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">NdkTest</span> &#123;</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> native <span class="keyword">String</span> <span class="title">stringFromUTF</span><span class="params">(<span class="keyword">String</span> str)</span></span>;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">static</span> &#123;</span><br><span class="line">        System.loadLibrary(<span class="string">"yemu"</span>);</span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>如果到这里有点懵逼的同学可能需要去补下 Android 开发基础了</p>
                  <h4 id="Android-项目测试-so"><a href="#Android-项目测试-so" class="headerlink" title="Android 项目测试 so"></a>Android 项目测试 so</h4>
                  <p>先说下我的环境，因为这个环境影响太大了 1，AndroidStudio 3.4 2，手机 Android 6 架构 armeabi-v7a 打开 AndroidStudio 新建 project <img src="http://pxx0jx04s.bkt.clouddn.com/FnI6Y8rCEUPsGF-Crl6fHbS1Jj7O" alt=""> 在 module 的 build 中加这么一句，然后 sync <img src="http://pxx0jx04s.bkt.clouddn.com/Fgsl4Zg7LocRkHREXLpBUaJDKsDB" alt=""> 把编译好的 so 文件复制到 libs 文件夹下（和刚才的 jniLibs.srcDirs 对应） <img src="http://pxx0jx04s.bkt.clouddn.com/FjUP5ssOtJSkb-Qas0liRT9yseIG" alt=""> 把 so 对应的 java 代码也 copy 过来，注意包名类名的一致性 <img src="http://pxx0jx04s.bkt.clouddn.com/FqHDAvfRfENxubUUG16galS4NPX8" alt=""> 打开 activity_main.xml 文件为 TextView 添加 id <img src="http://pxx0jx04s.bkt.clouddn.com/Fv-pvgJ3-5DL7P_0kvWZ6uRFODpJ" alt=""> 打开 MainActivity.java 开始编码 <img src="http://pxx0jx04s.bkt.clouddn.com/FtMaZriK312NMGuHHfnslHxH-DRq" alt=""> 这两行的意思就是，先从布局中找到对应 id 的 TextView，然后为其设置 Text（调用 native 函数的返回值） 下面测试一下咱们的 so 调用情况 <img src="http://pxx0jx04s.bkt.clouddn.com/FlZPSXoC-nljUM-57UYm7IBh78F8" alt=""> 可以看到咱们的 so 文件调用成功（这里咱们的 so 没有校验，只是测试 app 是否可以正常调用）</p>
                  <h4 id="AndServer-代码编写"><a href="#AndServer-代码编写" class="headerlink" title="AndServer 代码编写"></a>AndServer 代码编写</h4>
                  <p>AndServer 官方文档：<a href="https://yanzhenjie.com/AndServer/" target="_blank" rel="noopener">https://yanzhenjie.com/AndServer/</a> 打开官方文档，看看人家的入门介绍，新建 java 文件 <img src="http://pxx0jx04s.bkt.clouddn.com/FpwqVRWawPLsdE4BkB4vk51YeEfj" alt=""> 如图经典 MVC 的 C 就写好了，定义了一个 nightteam_sign 接口，请求方式为 get，请求参数为 sign，调用 native 函数，然后返回 json，但是这里我想利用 Application 获取下 context 对象，取下包名，接下来自定义 Application</p>
                  <figure class="highlight scala">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">package</span> com.nightteam.httpso;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> android.app.<span class="type">Application</span>;</span><br><span class="line"></span><br><span class="line">public <span class="class"><span class="keyword">class</span> <span class="title">MyApp</span> <span class="keyword">extends</span> <span class="title">Application</span> </span>&#123;</span><br><span class="line">    <span class="keyword">private</span> static <span class="type">MyApp</span> myApp;</span><br><span class="line">    public static <span class="type">MyApp</span> getInstance() &#123;</span><br><span class="line">        <span class="keyword">return</span> myApp;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@Override</span></span><br><span class="line">    public void onCreate() &#123;</span><br><span class="line">        <span class="keyword">super</span>.onCreate();</span><br><span class="line">        myApp = <span class="keyword">this</span>;</span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>然后在 manifest 文件中指定要启动的 Application <img src="http://pxx0jx04s.bkt.clouddn.com/FoiTsSgJMVTp5IPlTpk899AfjE-v" alt=""> 然后修改 MyController.java 的代码 <img src="http://pxx0jx04s.bkt.clouddn.com/FgO0OfbGufmBi__W0HBQ1PqTfo-w" alt=""> 接下来把官方文档-服务器的代码 copy 下来 导入一些包，修改部分代码如下 <img src="http://pxx0jx04s.bkt.clouddn.com/Flpyzen_6rJH_YOM86WS6-57AE2L" alt=""> 新版本的 AndServer.serverBuilder 已经需要传递 context 了，这里把网络地址和端口号也修改为从构造参数中获取，到这里 AndServer 的东西基本完了，实际上咱们就搭建一个调 so 的接口，并没有过多的业务逻辑，所以代码就是使用的最简单的</p>
                  <h4 id="Service-代码编写"><a href="#Service-代码编写" class="headerlink" title="Service 代码编写"></a>Service 代码编写</h4>
                  <p>咱们这里用按钮的点击事件启动 Service，故在 activity_main.xml 中添加一个 button 并指定点击事件 <img src="http://pxx0jx04s.bkt.clouddn.com/Fh7PsV-Ha67UumUhNt_7HfoziS0G" alt=""> <img src="http://pxx0jx04s.bkt.clouddn.com/FiB44r5XTQIKOZ4wbNtAUaql_PpL" alt=""> 接下来编写自定义 Service 代码</p>
                  <figure class="highlight java">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">package</span> com.nightteam.httpso.Service;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> android.app.Service;</span><br><span class="line"><span class="keyword">import</span> android.content.Intent;</span><br><span class="line"><span class="keyword">import</span> android.os.IBinder;</span><br><span class="line"><span class="keyword">import</span> android.util.Log;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> com.nightteam.httpso.ServerManager;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> java.net.InetAddress;</span><br><span class="line"><span class="keyword">import</span> java.net.UnknownHostException;</span><br><span class="line"></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">MyService</span> <span class="keyword">extends</span> <span class="title">Service</span> </span>&#123;</span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> String TAG = <span class="string">"NigthTeam"</span>;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@Override</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">onCreate</span><span class="params">()</span> </span>&#123;</span><br><span class="line">        <span class="keyword">super</span>.onCreate();</span><br><span class="line">        Log.d(TAG, <span class="string">"onCreate: MyService"</span>);</span><br><span class="line">        <span class="keyword">new</span> Thread() &#123;</span><br><span class="line">            <span class="meta">@Override</span></span><br><span class="line">            
<span class="function"><span class="keyword">public</span> <span class="keyword">void</span> <span class="title">run</span><span class="params">()</span> </span>&#123;</span><br><span class="line">                <span class="keyword">super</span>.run();</span><br><span class="line">                InetAddress inetAddress = <span class="keyword">null</span>;</span><br><span class="line">                <span class="keyword">try</span> &#123;</span><br><span class="line">                    inetAddress = InetAddress.getByName(<span class="string">"0.0.0.0"</span>);</span><br><span class="line">                    Log.d(TAG, <span class="string">"onCreate: "</span> + inetAddress.getHostAddress());</span><br><span class="line">                    ServerManager serverManager = <span class="keyword">new</span> ServerManager(getApplicationContext(), inetAddress, <span class="number">8005</span>);</span><br><span class="line">                    serverManager.startServer();</span><br><span class="line">                &#125; <span class="keyword">catch</span> (UnknownHostException e) &#123;</span><br><span class="line">                    e.printStackTrace();</span><br><span class="line">                &#125;</span><br><span class="line"></span><br><span class="line">            &#125;</span><br><span class="line">        &#125;.start();</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="meta">@Override</span></span><br><span class="line">    <span class="function"><span class="keyword">public</span> IBinder <span class="title">onBind</span><span class="params">(Intent intent)</span> </span>&#123;</span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>打上了几个 log，在子线程中启动 AndServer 的服务（何时使用 UI 线程和子线程是 Android 基础，这里就不赘述了） 注意一下，这里从 0.0.0.0 获取 inetAddress，可不要写错了，localhost 和 0.0.0.0 的区别请移步搜索引擎 然后就是向 ServerManager 的构造函数传递 context，inetAddress，port 用来 new 对象，随后开启服务 最后注意检查下 manifest 文件中 Service 的声明 <img src="http://pxx0jx04s.bkt.clouddn.com/FrxfrVW-TP5EGegkMIlpKNiiTyqI" alt=""></p>
                  <h4 id="开启-Service，并获取本机-ip"><a href="#开启-Service，并获取本机-ip" class="headerlink" title="开启 Service，并获取本机 ip"></a>开启 Service，并获取本机 ip</h4>
                  <p>回到我们的 MainActivity.java 的 operate（ button 的点击事件）编写启动 Service 代码</p>
                  <figure class="highlight reasonml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">public void operate(View view) &#123;</span><br><span class="line">       switch (view.get<span class="constructor">Id()</span>)&#123;</span><br><span class="line">           case <span class="module-access"><span class="module"><span class="identifier">R</span>.</span></span>id.id_bt_index:</span><br><span class="line">               <span class="comment">//启动服务:创建--&gt;启动--&gt;销毁</span></span><br><span class="line">               <span class="comment">//如果服务已经创建了，后续重复启动，操作的都是同一个服务，不会再重新创建了，除非你先销毁它</span></span><br><span class="line">               Intent it1 = <span class="keyword">new</span> <span class="constructor">Intent(<span class="params">this</span>, MyService.<span class="params">class</span>)</span>;</span><br><span class="line">               <span class="module-access"><span class="module"><span class="identifier">Log</span>.</span></span>d(TAG, <span class="string">"operate: button"</span>);</span><br><span class="line">               start<span class="constructor">Service(<span class="params">it1</span>)</span>;</span><br><span class="line">               ((Button) view).set<span class="constructor">Text(<span class="string">"服务已开启"</span>)</span>;</span><br><span class="line">               break;</span><br><span class="line">       &#125;</span><br><span class="line">   &#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>到这里我们的服务基本搭建好了，但是为了方便起见，我想把咱们的本机 ip 显示在 App 上，这样我们就不用去设置再查看了 我在网上找到了一个获取 ip 地址的一个工具类，源码如下:</p>
                  <figure class="highlight java">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">package</span> com.nightteam.httpso;</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> java.net.InetAddress;</span><br><span class="line"><span class="keyword">import</span> java.net.NetworkInterface;</span><br><span class="line"><span class="keyword">import</span> java.net.SocketException;</span><br><span class="line"><span class="keyword">import</span> java.util.Enumeration;</span><br><span class="line"><span class="keyword">import</span> java.util.regex.Pattern;</span><br><span class="line"></span><br><span class="line"><span class="keyword">public</span> <span class="class"><span class="keyword">class</span> <span class="title">NetUtils</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">    <span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">final</span> Pattern IPV4_PATTERN = Pattern.compile(<span class="string">"^("</span> +</span><br><span class="line"></span><br><span class="line">            <span class="string">"([0-9]|[1-9][0-9]|1[0-9]&#123;2&#125;|2[0-4][0-9]|25[0-5])\\.)&#123;3&#125;"</span> +</span><br><span class="line"></span><br><span class="line">            <span class="string">"([0-9]|[1-9][0-9]|1[0-9]&#123;2&#125;|2[0-4][0-9]|25[0-5])$"</span>);</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">private</span> <span class="keyword">static</span> <span class="keyword">boolean</span> <span class="title">isIPv4Address</span><span class="params">(String input)</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">return</span> IPV4_PATTERN.matcher(input).matches();</span><br><span class="line"></span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line">    <span class="comment">//获取本机IP地址</span></span><br><span 
class="line"></span><br><span class="line">    <span class="function"><span class="keyword">public</span> <span class="keyword">static</span> InetAddress <span class="title">getLocalIPAddress</span><span class="params">()</span> </span>&#123;</span><br><span class="line"></span><br><span class="line">        Enumeration&lt;NetworkInterface&gt; enumeration = <span class="keyword">null</span>;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">try</span> &#123;</span><br><span class="line"></span><br><span class="line">            enumeration = NetworkInterface.getNetworkInterfaces();</span><br><span class="line"></span><br><span class="line">        &#125; <span class="keyword">catch</span> (SocketException e) &#123;</span><br><span class="line"></span><br><span class="line">            e.printStackTrace();</span><br><span class="line"></span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">if</span> (enumeration != <span class="keyword">null</span>) &#123;</span><br><span class="line"></span><br><span class="line">            <span class="keyword">while</span> (enumeration.hasMoreElements()) &#123;</span><br><span class="line"></span><br><span class="line">                NetworkInterface nif = enumeration.nextElement();</span><br><span class="line"></span><br><span class="line">                Enumeration&lt;InetAddress&gt; inetAddresses = nif.getInetAddresses();</span><br><span class="line"></span><br><span class="line">                <span class="keyword">if</span> (inetAddresses != <span class="keyword">null</span>)</span><br><span class="line"></span><br><span class="line">                    <span class="keyword">while</span> (inetAddresses.hasMoreElements()) &#123;</span><br><span class="line"></span><br><span class="line">                        InetAddress inetAddress = inetAddresses.nextElement();</span><br><span 
class="line"></span><br><span class="line">                        <span class="keyword">if</span> (!inetAddress.isLoopbackAddress() &amp;&amp; isIPv4Address(inetAddress.getHostAddress())) &#123;</span><br><span class="line"></span><br><span class="line">                            <span class="keyword">return</span> inetAddress;</span><br><span class="line"></span><br><span class="line">                        &#125;</span><br><span class="line"></span><br><span class="line">                    &#125;</span><br><span class="line">            &#125;</span><br><span class="line"></span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">null</span>;</span><br><span class="line"></span><br><span class="line">    &#125;</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>把工具类 copy 到我们的 Android 项目中，继续在 MainActivity.java 中编码 <img src="http://pxx0jx04s.bkt.clouddn.com/FgJn1_mmO3xfu8bC4tdiM-uUHM-w" alt=""> 获取了一下本机地址和 Android SDK 版本（ Android 8 之后启动 Service 方式不一样）</p>
                  <h4 id="申请权限，启动-App"><a href="#申请权限，启动-App" class="headerlink" title="申请权限，启动 App"></a>申请权限，启动 App</h4>
                  <p>最后一步就是为 app 申请网络权限了 <img src="http://pxx0jx04s.bkt.clouddn.com/FsWGhs2q4d-zwYjEZJXYIc8LJor9" alt=""> 随后连接我们的手机，运行项目，测试一下，点击开启服务 <img src="http://pxx0jx04s.bkt.clouddn.com/Fkon0mUBUUpqbbsDaQPvdc_xb2aE" alt=""> 看下 Android Studio 日志 <img src="http://pxx0jx04s.bkt.clouddn.com/Fl17ks9HRNRDTi-FeHAHz9GRNFXF" alt=""> 好像一切正常，在浏览器访问下试试（ ip 就是 App 中显示的 ip 地址） <img src="http://pxx0jx04s.bkt.clouddn.com/FolDpPh19DiCx3b5eKxT8EHCTZVs" alt=""> 如图，正常访问到了我们想要的内容。 回过头来说下 Service，打开我们手机的设置，找到应用程序管理-运行中的服务（手机不同，方式不同） <img src="http://pxx0jx04s.bkt.clouddn.com/FuKPa8wpqZ41LMC8vmCS3QTol2GN" alt=""> 可以看到我们的程序，运行了一个服务，这个服务就是咱们编码的 MyService <img src="http://pxx0jx04s.bkt.clouddn.com/Fp461KQ3wzQ_WYMW0XTO1i4bGebI" alt=""> 接下来杀掉该 App 进程，再次查看运行中的服务 <img src="http://pxx0jx04s.bkt.clouddn.com/Ft3ZLJIdLXcfd-zwp4K3zVfTXta9" alt=""> 我这里在权限管理设置了自动运行，可以保持服务的运行。（这个地方不同系统之间会有或大或小的差异） 至此，使用 App 起 http 服务调 so 就完成了。</p>
                  <hr>
                  <p>好了，上面就是利用 AndServer 打造 Android 服务器调 so 文件的整体思路和流程，如果你懒得看的话，直接用我写好的 App 修修补补也是可以的，只需要发送消息【AndServer搭建Web服务调so】到公众号【NightTeam】即可。</p>
                  <hr>
                  <p>文章作者：「夜幕团队 NightTeam 」- 妄为 夜幕团队成立于 2019 年，团队成员包括崔庆才、周子淇、陈祥安、唐轶飞、冯威、蔡晋、戴煌金、张冶青和韦世东。 涉猎的主要编程语言为 Python、Rust、C++、Go，领域涵盖爬虫、深度学习、服务研发和对象存储等。团队非正亦非邪，只做认为对的事情，请大家小心。</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/NightTeam" class="author" itemprop="url" rel="index">NightTeam</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-20 01:16:37" itemprop="dateCreated datePublished" datetime="2019-09-20T01:16:37+08:00">2019-09-20</time>
                </span>
                <span id="/7544.html" class="post-meta-item leancloud_visitors" data-flag-title="AndServer+Service 打造 Android 服务器实现 so 文件调用" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>6.2k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>6 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7542.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7542.html" class="post-title-link" itemprop="url">如何将协议规范变成开源库系列文章之 WebSocket</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>这是系列文章的第一篇，也是非常重要的一篇，希望大家能读懂我想要表达的意思。</p>
                  <h1 id="系列文章开篇概述"><a href="#系列文章开篇概述" class="headerlink" title="系列文章开篇概述"></a>系列文章开篇概述</h1>
                  <p>相对于其他编程语言来说，Python 生态中最突出的就是第三方库。任何一个及格的 Python 开发者都使用过至少 5 款第三方库。 就爬虫领域而言，必将用到的例如网络请求库 Requests、网页解析库 Parsel 或 BeautifulSoup、数据库对象关系映射 Motor 或 SQLAlchemy、定时任务 Apscheduler、爬虫框架 Scrapy 等。 这些开源库的使用方法想必大家已经非常熟练了，甚至还修炼出了自己的一套技巧，日常工作中敲起键盘肯定也是哒哒哒的响。 但是你有没有想过：</p>
                  <ul>
                    <li>那个神奇的功能是如何实现的？</li>
                    <li>这个功能背后的逻辑是什么？</li>
                    <li>为什么要这样做而不是选择另一种写法？</li>
                    <li>编写这样的库需要用到哪些知识？</li>
                    <li>这个论点是否有明确的依据？</li>
                  </ul>
                  <p><img src="https://user-gold-cdn.xitu.io/2019/9/14/16d2dda24b0853ed?w=312&amp;h=312&amp;f=png&amp;s=127644" alt=""> 如果你从未这样想过，那说明你还没到达应该「渡劫」的时机；如果你曾提出过 3 个以上的疑问，那说明你即将到达那个重要的关口；如果你常常这么想，而且也尝试着寻找对应的答案，那么恭喜你，你现在正处于「渡劫」的关口之上。 <img src="https://user-gold-cdn.xitu.io/2019/9/14/16d2de0cdaef77c5?w=538&amp;h=282&amp;f=png&amp;s=104780" alt=""> 偶有群友会抛出这样的问题：初级工程师、中级工程师、高级工程师如何界定？ 这个问题有两种不同的观点，第一个是看工作职级，第二个则是看个人能力。工作职级是一个浮动很大的参照物，例如阿里巴巴的高级研发和我司的高级研发，职级名称都是「高级研发」，但能力可能会有很大的差距。 个人能力又如何评定呢？ 难不成看代码写的快还是写的慢吗？ 当然不是！ 个人能力应当从广度和深度两个方面进行考量，这并没有一个明确的标准。当两人能力差异很大的时候，外人可以轻松的分辨孰强孰弱。 自己怎样分辨个人能力的进与退呢？ 这就回到了上面提到的那些问题：WHO WHAT WHERE WHY WHEN HOW？ 我想通过这篇文章告诉你，不要做那个用库用得很熟练的人，要做那个创造库的人。计算机世界如此吸引人，就是因为我们可以在这个世界里尽情创造。 你想做一个创造者吗？ 如果不想，那现在你就可以关掉浏览器窗口，回到 Hub 的世界里。</p>
                  <h1 id="内容介绍"><a href="#内容介绍" class="headerlink" title="内容介绍"></a>内容介绍</h1>
                  <p>这是一套系列文章，这个系列将为大家解读常见库（例如 WebSocket、HTTP、ASCII、Base64、MD5、AES、RSA）的协议规范和对应的代码实现，帮助大家「知其然，知其所以然」。</p>
                  <h2 id="目标"><a href="#目标" class="headerlink" title="目标"></a>目标</h2>
                  <p>这次我们要学习的是 WebSocket 协议规范和代码实现，也可以理解为从 0 开始编写 <a href="https://github.com/asyncins/aiowebsocket" target="_blank" rel="noopener">aiowebsocket</a> 库。至于为什么选择它，那大概是因为全世界没有比我更熟悉它的人了。 我是 aiowebsocket 库的作者，我花了 7 天编写这个库。写库的过程，让我深刻体会到造轮子和驾驶的区别，也让我有了飞速的进步。我希望用连载系列文章的形式帮助大家从驾驶者转换到创造者，拥有「编程思考」。</p>
                  <h2 id="前置条件"><a href="#前置条件" class="headerlink" title="前置条件"></a>前置条件</h2>
                  <p>WebSocket 是一种在单个 TCP 连接上进行全双工通信的协议，它的出现使客户端和服务器之间的数据交换变得更加简单。下图描述了双端交互的流程: WebSocket 通常被应用在实时性要求较高的场景，例如赛事数据、股票证券、网页聊天和在线绘图等。WebSocket 与 HTTP 协议完全不同，但同样被广泛应用。 无论是后端开发者、前端开发者、爬虫工程师或者信息安全工作者，都应该掌握 WebSocket 协议的知识。 我曾经发表过几篇关于 WebSocket 的文章：</p>
                  <ul>
                    <li><a href="https://juejin.im/post/5d4cbc0cf265da038f47fa37" target="_blank" rel="noopener">【严选-高质量文章】开发者必知必会的 WebSocket 协议</a></li>
                    <li><a href="https://juejin.im/post/5c80b768f265da2dae514d4f" target="_blank" rel="noopener">Python如何爬取实时变化的WebSocket数据</a></li>
                    <li><a href="https://juejin.im/post/5c7cdaabf265da2daf79c15f" target="_blank" rel="noopener">WebSocket 从入门到写出开源库</a></li>
                  </ul>
                  <p>其中，《【严选-高质量文章】开发者必知必会的 WebSocket 协议》介绍了协议规范的相关知识。这篇文章的内容大体如下：</p>
                  <ul>
                    <li>WebSocket 协议来源</li>
                    <li>WebSocket 协议的优点</li>
                    <li>WebSocket 协议规范</li>
                    <li>一些实际代码演示</li>
                  </ul>
                  <p>如果没有掌握 WebSocket 协议的朋友，我建议先去阅读这篇文章，尤其是对 <a href="https://juejin.im/post/5d4cbc0cf265da038f47fa37#heading-4" target="_blank" rel="noopener">WebSocket 协议规范</a>介绍的那部分。 要想将协议规范 RFC6455 变成开源库，第一步就是要熟悉整个协议规范，所以你需要阅读<a href="https://juejin.im/post/5d4cbc0cf265da038f47fa37" target="_blank" rel="noopener">【严选-高质量文章】开发者必知必会的 WebSocket 协议</a>。当然，有能力的同学直接阅读 RFC6455 也未尝不可。 接着还需要了解编程语言中内置库 Socket 的基础用法，例如 Python 中的 <a href="https://docs.python.org/3/library/socket.html?highlight=socket#module-socket" target="_blank" rel="noopener">socket</a> 或者更高级更潮的 <a href="https://docs.python.org/3/library/asyncio-stream.html" target="_blank" rel="noopener">Streams</a>、<a href="https://docs.python.org/3/library/asyncio-protocol.html" target="_blank" rel="noopener">Transports and Protocols</a>。如果你是 Go 开发者、Rust 开发者，请查找对应语言的内置库。 假设你已经熟悉了 RFC6455，你应该知道 Frame 打包和解包的时候需要用到位运算，正好我之前写过位运算相关的文章 <a href="https://gitbook.cn/gitchat/activity/5d4d6d8f6f29256fa317e946" target="_blank" rel="noopener">7分钟全面了解位运算</a>。 至于其它的，现用现学吧！</p>
                  <h1 id="Python-网络通信之-Streams"><a href="#Python-网络通信之-Streams" class="headerlink" title="Python 网络通信之 Streams"></a>Python 网络通信之 Streams</h1>
                  <p>WebSocket，也可以理解为在 WEB 应用中使用的 Socket，这意味着本篇将会涉及到 Socket 编程。上面提到，Python 中与 Socket 相关的有 socket、Streams、Transports and Protocols。其中 socket 是同步的，而另外两个是异步的，这俩属于你常听到的 asyncio。</p>
                  <h2 id="Socket-通信过程"><a href="#Socket-通信过程" class="headerlink" title="Socket 通信过程"></a>Socket 通信过程</h2>
                  <p>Socket 是端到端的通信，所以我们要搞清楚消息是怎么从一台机器发送到另一台机器的，这很重要。假设通信的两台机器为 Client 和 Server，Client 向 Server 发送消息的过程如下图所示：</p>
                  <blockquote>
                    <p>Client 通过文件描述符的读写 API read &amp; write 来访问操作系统内核中的网络模块为当前套接字分配的发送 send buffer 和接收 recv buffer 缓存。 Client 进程写消息到内核的发送缓存中，内核将发送缓存中的数据传送到物理硬件 NIC，也就是网络接口芯片 (Network Interface Circuit)。 NIC 负责将翻译出来的模拟信号通过网络硬件传递到服务器硬件的 NIC。 服务器的 NIC 再将模拟信号转成字节数据存放到内核为套接字分配的接收缓存中，最终服务器进程从接收缓存中读取数据即为源客户端进程传递过来的消息。</p>
                  </blockquote>
                  <p>上述通信过程的描述和图片均出自钱文品的《深入理解 RPC 交互流程》。 我尝试寻找通信过程中每个步骤的依据（尤其是 send buffer to NIC to recv buffer），也翻阅了 TCP 的 RFC 和 Kernel.org，但遗憾的是并未找到有力的证明（一定是我太菜了）。如果有朋友知道，可以评论告诉我或发邮件 zenrusts@sina.com 告诉我，我可以扩展出另一篇文章。</p>
                  <h2 id="创建-Streams"><a href="#创建-Streams" class="headerlink" title="创建 Streams"></a>创建 Streams</h2>
                  <p>那么问题来了：在 Python 中，我们如何实现端到端的消息发送呢？ 答：Python 提供了一些对象帮助我们实现这个需求，其中相对简单易用的是 Streams。 Streams 是 Python Asynchronous I/O 中提供的 High-level APIs。Python 官方文档对 Streams 的介绍如下：</p>
                  <blockquote>
                    <p>Streams are high-level async/await-ready primitives to work with network connections. Streams allow sending and receiving data without using callbacks or low-level protocols and transports.</p>
                  </blockquote>
                  <p>我尬译一下：Streams 是用于网络连接的 high-level async/await-ready 原语。Streams 允许在不使用回调或 low-level protocols and transports 的情况下发送和接收数据。 Python 提供了 <code>asyncio.open_connection()</code> 让开发者创建 Streams，<code>asyncio.open_connection()</code> 将建立网络连接并返回 reader 和 writer 对象，这两个对象其实是 StreamReader 和 StreamWriter 类的实例。 开发者可以通过 StreamReader 从 IO 流中读取数据，通过 StreamWriter 将数据写入 IO 流。虽然文档并没有给出 IO 流的明确定义，但我猜它跟 buffer （也就是 send buffer to NIC to recv buffer 中的 buffer）有关，你也可以抽象的认为它就是 buffer。 有了 Streams，就有了端到端消息发送的完整实现。下面将通过一个例子来熟悉 Streams 的用法和用途。这是 Python 官方文档给出的双端示例，首先是 Server 端：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># TCP echo server using streams</span></span><br><span class="line"><span class="comment"># 本文出自「夜幕团队 NightTeam」 转载请联系并取得授权</span></span><br><span class="line"><span class="keyword">import</span> asyncio</span><br><span class="line"></span><br><span class="line"><span class="keyword">async</span> <span class="function"><span class="keyword">def</span> <span class="title">handle_echo</span><span class="params">(reader, writer)</span>:</span></span><br><span class="line">    data = <span class="keyword">await</span> reader.read(<span class="number">100</span>)</span><br><span class="line">    message = data.decode()</span><br><span class="line">    addr = writer.get_extra_info(<span class="string">'peername'</span>)</span><br><span class="line"></span><br><span class="line">    print(<span class="string">f"Received <span class="subst">&#123;message!r&#125;</span> from <span class="subst">&#123;addr!r&#125;</span>"</span>)</span><br><span class="line"></span><br><span class="line">    print(<span class="string">f"Send: <span class="subst">&#123;message!r&#125;</span>"</span>)</span><br><span class="line">    writer.write(data)</span><br><span class="line">    <span class="keyword">await</span> writer.drain()</span><br><span class="line"></span><br><span class="line">    print(<span class="string">"Close the connection"</span>)</span><br><span class="line">    writer.close()</span><br><span class="line"></span><br><span class="line"><span class="keyword">async</span> <span class="function"><span class="keyword">def</span> <span class="title">main</span><span class="params">()</span>:</span></span><br><span class="line">    server = <span class="keyword">await</span> asyncio.start_server(</span><br><span class="line">        handle_echo, <span class="string">'127.0.0.1'</span>, <span class="number">8888</span>)</span><br><span class="line"></span><br><span class="line">    addr = 
server.sockets[<span class="number">0</span>].getsockname()</span><br><span class="line">    print(<span class="string">f'Serving on <span class="subst">&#123;addr&#125;</span>'</span>)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">async</span> <span class="keyword">with</span> server:</span><br><span class="line">        <span class="keyword">await</span> server.serve_forever()</span><br><span class="line"></span><br><span class="line">asyncio.run(main())</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>接着是 Client 端：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># TCP echo client using streams</span></span><br><span class="line"><span class="comment"># 本文出自「夜幕团队 NightTeam」 转载请联系并取得授权</span></span><br><span class="line"><span class="keyword">import</span> asyncio</span><br><span class="line"></span><br><span class="line"><span class="keyword">async</span> <span class="function"><span class="keyword">def</span> <span class="title">tcp_echo_client</span><span class="params">(message)</span>:</span></span><br><span class="line">    reader, writer = <span class="keyword">await</span> asyncio.open_connection(</span><br><span class="line">        <span class="string">'127.0.0.1'</span>, <span class="number">8888</span>)</span><br><span class="line"></span><br><span class="line">    print(<span class="string">f'Send: <span class="subst">&#123;message!r&#125;</span>'</span>)</span><br><span class="line">    writer.write(message.encode())</span><br><span class="line"></span><br><span class="line">    data = <span class="keyword">await</span> reader.read(<span class="number">100</span>)</span><br><span class="line">    print(<span class="string">f'Received: <span class="subst">&#123;data.decode()!r&#125;</span>'</span>)</span><br><span class="line"></span><br><span class="line">    print(<span class="string">'Close the connection'</span>)</span><br><span class="line">    writer.close()</span><br><span class="line"></span><br><span class="line">asyncio.run(tcp_echo_client(<span class="string">'Hello World!'</span>))</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>将示例分别写入到 server.py 和 client.py 中，然后按序运行。此时 server.py 的窗口会输出如下内容：</p>
                  <figure class="highlight livecodeserver">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Serving <span class="keyword">on</span> (<span class="string">'127.0.0.1'</span>, <span class="title">8888</span>)</span><br><span class="line">Received <span class="string">'Hello World!'</span> <span class="built_in">from</span> (<span class="string">'127.0.0.1'</span>, <span class="number">59534</span>)</span><br><span class="line">Send: <span class="string">'Hello World!'</span></span><br><span class="line">Close <span class="keyword">the</span> connection</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>从输出中得知，服务启动的 address 和 port 为 <code>(&#39;127.0.0.1&#39;, 8888)</code>，从 <code>(&#39;127.0.0.1&#39;, 59534)</code> 读取到内容为 <code>Hello World!</code> 的消息，接着将 <code>Hello World!</code> 返回给 <code>(&#39;127.0.0.1&#39;, 59534)</code> ，最后关闭连接。 client.py 的窗口输出内容如下：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="string">Send:</span> <span class="string">'Hello World!'</span></span><br><span class="line"><span class="string">Received:</span> <span class="string">'Hello World!'</span></span><br><span class="line">Close the connection</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在创建连接后，Client 向指定的端发送了内容为 <code>Hello World!</code> 的消息，接着从指定的端接收到内容为 <code>Hello World!</code> 的消息，最后关闭连接。 有些读者可能不太理解，为什么 Client Send <code>Hello World!</code> ，而 Server 接收到之后也向 Client Send <code>Hello World!</code> 。双端的 Send 和 Received 都是 <code>Hello World!</code> ，这很容易让新手懵逼。实际上这就是一个普通的回显服务器示例，也就是说当 Server 收到消息时，将消息内容原封不动的返回给 Client。 这样只是为了演示，并无它意，但这样的示例却会给新手带来困扰。 以上是一个简单的 Socket 编程示例，整体思路理解起来还是很轻松的，接下来我们将逐步解读示例中的代码：</p>
                  <ul>
                    <li>client.py 中用 <code>asyncio.open_connection()</code> 连接指定的端，并获得 reader 和 writer 这两个对象。</li>
                    <li>然后使用 writer 对象中的 <code>write()</code> 方法将 <code>Hello World!</code> 写入到 IO 流中，该消息会被发送到 Server。</li>
                    <li>接着使用 reader 对象中的 <code>read()</code> 方法从 IO 流中读取消息，并将消息打印到终端。</li>
                  </ul>
                  <p>看到这里，你或许会有另一个疑问：<code>write()</code> 只是将消息写入到 IO 流，并没有发送行为，那消息是如何传输到 Server 的呢？ 由于无法直接跟进 CPython 源代码，所以我们无法得到确切的结果。但我们可以跟进 Python 代码，得知消息最后传输到 <code>transport.write()</code> ，如果你想知道更多，可以去看 Transports and Protocols 的介绍。你可以将这个过程抽象为上图的 Client to send buffer to NIC to recv buffer to Server。</p>
                  <h1 id="功能模块设计"><a href="#功能模块设计" class="headerlink" title="功能模块设计"></a>功能模块设计</h1>
                  <p>通过上面的学习，现在你已经掌握了 WebSocket 协议规范和 Python Streams 的基本用法，接下来就可以设计一个 WebSocket 客户端库了。 根据 RFC6455 的约定，WebSocket 之前是 HTTP，通过「握手」来升级协议。协议升级后进入真正的 WebSocket 通信，通信包含发送（Send）和接收（Recv）。文本消息要在传输过程前转换为 Frames，而接收端读取到消息后要将 Frames 转换成文本。当然，期间会有一些异常产生，我们可能需要自定义异常，以快速定位问题所在。现在我们得出了几个模块：</p>
                  <figure class="highlight asciidoc">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="bullet">* </span>握手 - ShakeHands</span><br><span class="line"></span><br><span class="line"><span class="bullet">* </span>传输 - Transports</span><br><span class="line"></span><br><span class="line"><span class="bullet">* </span>帧处理 - Frames</span><br><span class="line"></span><br><span class="line"><span class="bullet">* </span>异常 - Exceptions</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>一切准备就绪后，就可以进入真正的编码环节了。 由于实战编码篇幅太长，我决定放到下一期，这期的内容，读者们可能需要花费一些时间吸收。</p>
                  <h1 id="小结"><a href="#小结" class="headerlink" title="小结"></a>小结</h1>
                  <p>开篇我强调了「创造能力」有多么重要，甚至抛出了一些不是很贴切的例子，但我就是想告诉你，不要做调参侠。 然后我告诉你，本篇文章要讲解的是 WebSocket。 接着又跟你说，要掌握 WebSocket 协议，如果你无法独立啃完 RFC6455，还可以看我写过的几篇关于 WebSocket 文章和位运算文章。 过了几分钟，给你展示了 Socket 的通信过程，虽然没有强有力的依据，但你可以假设这是对的。 喝了一杯白开水之后，我向你展示了 Streams 的具体用法并为你解读代码的作用，重要的是将 Streams 与 Socket 通信过程进行了抽象。 这些前置条件都确定后，我又带着你草草地设计了 WebSocket 客户端的功能模块。 下一篇文章将进入代码实战环节，请做好环境（Python 3.6+）准备。</p>
                  <h2 id="总之，要想越过前面这座山，就请跟我来！"><a href="#总之，要想越过前面这座山，就请跟我来！" class="headerlink" title="总之，要想越过前面这座山，就请跟我来！"></a>总之，要想越过前面这座山，就请跟我来！</h2>
                  <hr>
                  <p>文章作者：「夜幕团队 NightTeam 」- 韦世东 夜幕团队成立于 2019 年，团队成员包括崔庆才、周子淇、陈祥安、唐轶飞、冯威、蔡晋、戴煌金、张冶青和韦世东。 涉猎的主要编程语言为 Python、Rust、C++、Go，领域涵盖爬虫、深度学习、服务研发和对象存储等。团队非正亦非邪，只做认为对的事情，请大家小心。</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/NightTeam" class="author" itemprop="url" rel="index">NightTeam</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-20 01:14:47" itemprop="dateCreated datePublished" datetime="2019-09-20T01:14:47+08:00">2019-09-20</time>
                </span>
                <span id="/7542.html" class="post-meta-item leancloud_visitors" data-flag-title="如何将协议规范变成开源库系列文章之 WebSocket" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>6.5k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>6 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7540.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7540.html" class="post-title-link" itemprop="url">用 Docker 搭建商业级 4G 代理</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>时间过得真快，距离这个系列的上一篇文章《商业级4G代理搭建指南【准备篇】》发布的时间已经过了两个星期了，上个星期由于各种琐事缠身，周二开始就没空写文章了，所以就咕咕咕了。 那么在准备篇中，我们了解了一下搭建 4G 代理所需要的软硬件，也知道了各种选择的优劣势。现在，我们就可以开始实际搭建了，相信大家也是期待已久了。</p>
                  <hr>
                  <h4 id="基本思路"><a href="#基本思路" class="headerlink" title="基本思路"></a>基本思路</h4>
                  <p>从这篇文章的标题中我们可以看出，这一次的搭建方案主要用到的是 Docker，你可能会很好奇，Docker 跟搭建 4G 代理有什么关系吗？ 嗯，关系很大，我们把整件事情梳理一下，先来看看搭建 4G 代理时的基本流程：</p>
                  <ol>
                    <li>调用网卡拨号，拨号成功后会创建一个虚拟网卡。（正常情况下使用这个虚拟网卡就能上网了） <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/1.png?x-oss-process=style/weixin" alt=""></li>
                    <li>在多网卡的情况下，重复第一步，会得到多个虚拟网卡。 <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/2.png?x-oss-process=style/weixin" alt=""></li>
                    <li>启动代理服务器，使其使用虚拟网卡作为出网网卡，并使用接入内网的实体网卡作为入网网卡。 <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/3.png?x-oss-process=style/weixin" alt="使用起来差不多是这样的"></li>
                  </ol>
                  <p>但是呢，有个问题，根据我之前的测试结果来看，目前在 Linux 环境下还没有一个 HTTP 代理服务器是可以做到分别指定出网网卡和入网网卡的，嗯…这就很麻烦了，因为如果我们无法这么做的话，就会出现类似于下面这样的问题：</p>
                  <ol>
                    <li>出网和入网都在虚拟网卡上，使用代理服务器必须要走公网访问。</li>
                    <li>入网为实体网卡，但出网被代理服务器锁定为了某一个，无法利用到多网卡。</li>
                  </ol>
                  <p>嗯…那么不用 HTTP 代理服务器，用那些经常被用来做一些骚操作的 Socks5 代理服务器呢？如果可以指定网卡的话，再用像 Privoxy 之类的工具把 Socks5 代理转成 HTTP 代理就好了。（某知名扶墙软件的 Windows 版本就是这么转的 HTTP 代理） 在经过一番尝试后，我发现虽然有些 Socks5 代理服务器的文档中是说可以指定网卡，但按照说明操作后，似乎并不能直接做到我想要的效果（要么还是锁定在某一个上面、要么上不了网），所以还是存在一些问题的。可能是需要配合路由表设置来进行操作吧，不过我对网络工程的了解不怎么深，搞了几天也没搞出来，于是乎还得想想别的办法。 <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/4.png?x-oss-process=style/weixin_wushuiyin" alt=""> 这时候，我想到了一个东西——Docker，它可以用来解决这个问题！ 因为 Docker 容器被创建后，不管外界的网卡有多少个，容器内部的网卡都只会有一个Docker自己的虚拟网卡（容器间通信用的）和一个本地环回接口（不用管它），而且我们在容器内进行拨号操作时，产生的那个新的虚拟网卡也不会影响到外界或其他容器，这样的话，代理服务器就不需要指定网卡了，直接启动就能跑！ 那么现在整个流程就跑通了，进入实际操作环节看看吧！</p>
                  <hr>
                  <h4 id="系统方面"><a href="#系统方面" class="headerlink" title="系统方面"></a>系统方面</h4>
                  <p>这个 Docker 版的搭建方式，系统方面的选择很多，由于我使用的样例设备是树莓派，所以这里就选择使用了 Raspbian（树莓派专属版 Debian）。如果你使用的是其他设备的话，直接选择一个自己常用的系统就好。 那么准备好之后的第一步当然是先下载并安装 Docker，这里我直接使用 Docker 官方提供的一键安装脚本来进行安装：</p>
                  <figure class="highlight dsconfig">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="string">curl </span>-<span class="string">fsSL </span><span class="string">https:</span>//<span class="string">get.</span><span class="string">docker.</span><span class="string">com </span>-o <span class="built_in">get-docker.sh</span></span><br><span class="line"><span class="built_in">sudo</span> <span class="string">sh </span><span class="built_in">get-docker.sh</span></span><br><span class="line"><span class="built_in">#</span> 出自官方文档：<span class="string">https:</span>//<span class="string">docs.</span><span class="string">docker.</span><span class="string">com/</span><span class="string">install/</span><span class="string">linux/</span><span class="string">docker-ce/</span><span class="string">debian/</span><span class="comment">#install-using-the-convenience-script</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这个一键安装脚本理论上来讲所有 Linux 发行版都可以使用，毕竟已经出来很长时间了，如果不行的话请自行使用搜索引擎查找相关资料。 装好 Docker 之后，你有两个选择：</p>
                  <ol>
                    <li>进入体验模式，了解一下具体操作细节是怎么样的。</li>
                    <li>不看这一段，翻到本文最下方直接使用我写好的轮子。</li>
                  </ol>
                  <h4 id="启动容器"><a href="#启动容器" class="headerlink" title="启动容器"></a>启动容器</h4>
                  <p>体验的话，我们就直接这么启动一个 Docker 容器吧，执行以下命令：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">sudo docker run -it --rm --privileged -p <span class="number">3128</span>:<span class="number">3128</span> ubuntu:<span class="number">18.04</span> bash</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <blockquote>
                    <p>上面这条命令的意思是，启动一个内部系统为 Ubuntu18.04 的容器，并进入容器内部的 Shell 执行 <code>bash</code> 命令，如果退出 bash 就自动销毁容器；然后映射容器内的端口3128到外界，映射出来的外界端口也是3128；最后 <code>privileged</code> 参数是开启特权模式，用于将网卡设备映射进容器内。 如果下载镜像很慢的话，可以搜一下“Docker 加速器”，也可以直接扶墙。</p>
                  </blockquote>
                  <h4 id="测试一下网卡是否正常"><a href="#测试一下网卡是否正常" class="headerlink" title="测试一下网卡是否正常"></a>测试一下网卡是否正常</h4>
                  <p>进入容器内部后，我们可以执行一下 <code>ls /dev/ttyUSB*</code> 看一下网卡有没有正常被识别出来（在容器外也是一样的，因为开了特权模式），如果是和我买的同一款 4G 网卡的话，在只插入一张网卡的情况下你会看到4个 ttyUSB 设备。 <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/5.png?x-oss-process=style/weixin" alt="插入了三张网卡的样子，一共12个 ttyUSB 设备"></p>
                  <blockquote>
                    <p>不同 4G 网卡和硬件组合可能会有差异，请以实际情况为准。</p>
                  </blockquote>
                  <p>如果你可以看到<code>4✖4G网卡个数</code>个 ttyUSB 设备的话，就说明没有问题，可以开始下一步了。</p>
                  <h4 id="拨号上网"><a href="#拨号上网" class="headerlink" title="拨号上网"></a>拨号上网</h4>
                  <p>接下来要做的就是拨号了，拨号方面可以选择使用 Wvdial 这种工具，也可以选择使用像 Fanconn 这样的商家提供的拨号脚本（直接调用 PPPD），使用起来的效果会有一些区别。如果商家没有提供拨号脚本的话，就用 Wvdial 吧，它能自动生成配置，上手即用。 我这边的话，由于 Fanconn 的技术人员直接提供了个拨号脚本，那我就用这个脚本了，Wvdial 的文档网上有很多很详尽的，这里就不再多提，需要的朋友自行搜索即可。 如果你用的是 Fanconn 的这个拨号脚本（怎么弄进容器内就不用我说了吧？），那么直接在 <code>apt install ppp</code> 安装好拨号工具之后，用 <code>chmod +x quectel-pppd.sh</code> 给拨号脚本加个运行权限，然后 <code>./quectel-pppd.sh /dev/ttyUSB3</code> 即可。</p>
                  <blockquote>
                    <p>拨号时使用的 <code>/dev/ttyUSB3</code> 是指 4G 网卡的第四个通信端口，文档中的解释为：ttyUSB3→For PPP connections or AT command communication，翻译一下就是用于 PPP 连接或 AT 命令通信。</p>
                  </blockquote>
                  <p>拨号之后用 ifconfig 之类的工具即可看到类似下图中的状态： <img src="https://oss.crawler-lab.com/%E5%95%86%E4%B8%9A%E7%BA%A74G%E4%BB%A3%E7%90%86%E6%90%AD%E5%BB%BA%E6%8C%87%E5%8D%97%E3%80%90%E6%90%AD%E5%BB%BA%E7%AF%87%E4%B9%8BDocker%E7%89%88%E3%80%91/asserts/6.png?x-oss-process=style/weixin" alt=""> 可以看到，如前文所述，现在有三个网卡，一个是 Docker 自己的、一个是本地环回接口（这个不用管）、一个是拨号产生的虚拟网卡。</p>
                  <blockquote>
                    <p>如果不是在 Docker 容器内使用的话，还会有个 wwan0（或其他名字），那个是 4G 网卡本体。</p>
                  </blockquote>
                  <h4 id="测试是否能正常上网"><a href="#测试是否能正常上网" class="headerlink" title="测试是否能正常上网"></a>测试是否能正常上网</h4>
                  <p>现在如果你用 curl 的 <code>--interface</code> 参数指定虚拟网卡进行请求的话（如：<code>curl --interface ppp0 https://ip.cn</code>），是已经可以请求成功的了，IP 也会是你所使用的 SIM 卡对应的运营商分配的。</p>
                  <blockquote>
                    <p>由于 Docker 的镜像通常都是极度精简的，所以 Ubuntu 镜像里并没有预装像 net-tools、iputils-ping、vim、curl 之类的这些包，需要自行安装。所以如果你发现 ifconfig、ping、curl、vim 用不了，不要惊慌，这是正常现象，执行 <code>apt install 包名</code> 命令安装即可。</p>
                  </blockquote>
                  <p>如果你无法直接请求成功的话，就可能是 DNS 解析出问题了，可以尝试 ping 一个公网 IP（如：<code>ping 1.1.1.1</code>）和一个域名（如：<code>ping ip.cn</code>），如果 IP 能 ping 通但域名会报 DNS 解析失败的话，就可以确认是 DNS 设置问题了。 4G 拨号时如果出现 DNS 设置问题，通常是因为拨号工具没有正常地将运营商返回的 DNS 服务器设置写入到配置中，我们可以手动配置一下（你要强制指定某一个 DNS 也可以）：</p>
                  <figure class="highlight lsl">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"># 以下为阿里云的公共DNS</span><br><span class="line">echo 'nameserver <span class="number">223.5</span><span class="number">.5</span><span class="number">.5</span>' &gt;&gt; /etc/resolv.conf</span><br><span class="line">echo 'nameserver <span class="number">223.6</span><span class="number">.6</span><span class="number">.6</span>' &gt;&gt; /etc/resolv.conf</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <blockquote>
                    <p>在 Docker 容器中，这个 <code>/etc/resolv.conf</code> 文件可能还会有两条内容，是容器本身所需要的，建议不要删除/覆盖，否则会出现容器间无法使用容器名互相通信的情况。</p>
                  </blockquote>
                  <h4 id="启动代理服务器"><a href="#启动代理服务器" class="headerlink" title="启动代理服务器"></a>启动代理服务器</h4>
                  <p>那么在测试拨号后确实可以通过 4G 网卡上网了之后，我们就可以把代理服务器启动了，这里我使用的是 TinyProxy。</p>
                  <blockquote>
                    <p>测试发现，Squid 对资源的占用更大一些，不利于多网卡情况下的使用，会影响到 4G 网卡的数量上限。</p>
                  </blockquote>
                  <p>先 <code>apt install tinyproxy</code> 一波，然后 <code>vim /etc/tinyproxy/tinyproxy.conf</code> 修改一下配置。 要修改的配置主要有：</p>
                  <ul>
                    <li>Port 配置项改为3128，因为我们前面映射出来的端口是3128。</li>
                    <li>Listen 配置项改为0.0.0.0，因为我们需要在其他设备上使用这个代理服务器。</li>
                    <li>Allow 配置项注释掉或改为0.0.0.0/0，默认的127.0.0.1会导致其他设备无法访问。</li>
                  </ul>
                  <p>改完之后保存一波，然后就可以直接执行 <code>tinyproxy</code> 启动了…吗？ 等等，还有一个操作要做！那就是将默认路由指向到虚拟网卡上，很简单，执行以下命令即可：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">route del -net 0.0.0.0 eth0</span><br><span class="line">route <span class="builtin-name">add</span> -net 0.0.0.0 ppp0</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这两条命令的意思是：先将默认的、指向 eth0 这个网卡的上网路由删除，然后添加一个同样的、指向 ppp0 这个网卡的路由。 改完默认路由后的效果就是，即使你不使用 curl 的 <code>--interface</code> 参数，也能直接使用 4G 网卡上网了。</p>
                  <blockquote>
                    <p>如果没有改默认路由的话，在不指定网卡的情况下，4G 网卡并不会被使用到，因为默认路由指向的是 Docker 自身的虚拟网卡，那个网卡通向你原本的内网环境。也就是说，IP 不会变！</p>
                  </blockquote>
                  <p>那么现在，你可以执行 <code>tinyproxy</code> 启动代理服务器了。</p>
                  <h4 id="测试代理服务器"><a href="#测试代理服务器" class="headerlink" title="测试代理服务器"></a>测试代理服务器</h4>
                  <p>好了，代理服务器应该已经正常启动了，现在我们可以在另一个设备上尝试连接那个容器中的代理服务器，看看是否能正常通过它使用 4G 网卡上网。 例如我这里树莓派分配到的IP是：<code>192.168.137.66</code>，那么我就可以用这样的 curl 命令或 Python 代码进行测试： curl：</p>
                  <figure class="highlight gml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">curl <span class="string">"https://ip.cn"</span></span><br><span class="line">curl -<span class="symbol">x</span> <span class="string">"192.168.137.66:3128"</span> <span class="string">"https://ip.cn"</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>Python：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line">resp = requests.<span class="builtin-name">get</span>(<span class="string">"https://ip.cn"</span>, proxies=&#123;<span class="string">"https"</span>: <span class="string">"http://192.168.137.66:3128"</span>&#125;)</span><br><span class="line">no_proxy_resp = requests.<span class="builtin-name">get</span>(<span class="string">"https://ip.cn"</span>)</span><br><span class="line"><span class="builtin-name">print</span>(resp.text)</span><br><span class="line"><span class="builtin-name">print</span>(no_proxy_resp.text)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>测试出来的结果应该与前面在容器内部测试时的一致，在使用代理后 IP 就变成了运营商分配的基站 IP。</p>
                  <h4 id="更换-IP"><a href="#更换-IP" class="headerlink" title="更换 IP"></a>更换 IP</h4>
                  <p>那么最核心的问题来了，怎么更换 IP 呢？ 其实和使用那些拨号 VPS 架设代理服务器一样，我们只需要重新拨个号就能换 IP 了，直接 kill 掉 pppd 进程就可以让它断开拨号，断开后重新执行一遍拨号脚本就是重新拨号了。</p>
                  <blockquote>
                    <p>断开拨号方面 Fanconn 的技术人员也提供了一个脚本，同样在 <code>chmod +x quectel-ppp-kill</code> 赋予运行权限之后，执行 <code>./quectel-ppp-kill</code> 就可以了。</p>
                  </blockquote>
                  <p>但需要注意的是，蜂窝网络的拨号在断开后，IP 仍然会保留一段时间（具体多久不清楚，可能跟连接的基站也有关系），所以我们需要强制性地让网卡重新搜网。</p>
                  <blockquote>
                    <p>冷门小知识：手机上开启关闭飞行模式的效果就是重新搜网，通常只是关闭“移动数据”的话，效果是与断开拨号一致的。</p>
                  </blockquote>
                  <p>怎么做呢？很简单，就两行命令：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">AT+<span class="attribute">CFUN</span>=0</span><br><span class="line">AT+<span class="attribute">CFUN</span>=1</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>但注意哦，这是 AT 命令，不是 Linux 下的 Shell 命令，AT 命令是一种调制解调器命令语言，我们如果需要将它执行起来，需要这么做：</p>
                  <figure class="highlight jboss-cli">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">echo</span> <span class="string">"AT+CFUN=0"</span> &gt; <span class="string">/dev/ttyUSB2</span></span><br><span class="line"><span class="comment"># 中间间隔1秒左右</span></span><br><span class="line"><span class="keyword">echo</span> <span class="string">"AT+CFUN=1"</span> &gt; <span class="string">/dev/ttyUSB2</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <blockquote>
                    <p>这里使用的 <code>/dev/ttyUSB2</code> 是指 4G 网卡的第三个通信端口，文档中的解释为：ttyUSB2→For AT command communication，与第四个通信端口类似，只是它不能用于 PPP 连接、只能用于 AT 命令通信而已。 这里没有同样使用第四个通信端口的原因是，那个端口有被占用的可能性，直接区分开最稳妥，本来网卡也就是提供了两个 AT 命令通信渠道的。</p>
                  </blockquote>
                  <p>在使网卡重新搜网后的几秒至十几/几十秒内的时间里，你无法正常拨号，需要等待它初始化完成后才可以拨号成功，具体等待时间以信号强度为准，我测试的时候通常5秒以内就可以了。 所以如果你在断开后一直拨号失败，不妨过一会儿再试。</p>
                  <hr>
                  <h4 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h4>
                  <p>那么现在操作流程也跑通了，我们也了解到了整个的内部细节，最后要做的就是把每个网卡都分别分配一个容器，这样我们就能实现文章开头所提到的——“使用虚拟网卡作为出网网卡，并使用接入内网的实体网卡作为入网网卡”的效果了。 实际操作起来的话，就是把指定网卡的部分给配置化，然后在启动容器的时候传入就好了，使用 Docker 的容器环境变量相关设置可以很轻松地实现这个功能。 最后，我们可以以这个思路，构建一个 docker-compose 模板，模板的核心内容一是做个简易的4G网卡容器集群，二是启动个 Squid，用来聚合代理服务器，这样我们使用的时候只需要指定一个代理服务器就能随机更换了，操作起来更加方便。</p>
                  <hr>
                  <p>好了，上面就是 Docker 版搭建方式的思路和整个的搭建流程，如果你懒得看的话，直接用我写好的轮子也是可以的，只需要发送消息【Docker版4G代理】到公众号【NightTeam】即可。</p>
                  <h4 id="评价"><a href="#评价" class="headerlink" title="评价"></a>评价</h4>
                  <p>最后的最后，我给这个搭建方式打个评价吧。 这个搭建方式并不完美，因为变量太多，而且很多地方肯定不如系统级原生支持的那么稳定，长期使用可能会出现各种奇奇怪怪的问题。 然后 Docker 的资源占用其实挺高的，会浪费相当多的内存在启动容器上，如果只是两三个网卡还好，如果数量大一点的话，像树莓派2B 这种小内存的设备根本就扛不住。 另外代理服务器本身对资源的消耗也是比较高的，高频调用下对树莓派2B 的小 CPU 压力还是蛮大的，即使我对它的 CPU 进行了超频，在并发测试时也还是会出现轻松打满 CPU 的情况。 <strong>但是！截至目前，我还有两种基于路由器系统的搭建方案没写出来！所以…敬请期待后续的其他搭建方案（斜眼笑）。</strong></p>
                  <hr>
                  <p>文章作者：「夜幕团队 NightTeam」 - Loco 夜幕团队成立于 2019 年，团队包括崔庆才、周子淇、陈祥安、唐轶飞、冯威、蔡晋、戴煌金、张冶青和韦世东。 涉猎的编程语言包括但不限于 Python、Rust、C++、Go，领域涵盖爬虫、深度学习、服务研发、对象存储等。团队非正亦非邪，只做认为对的事情，请大家小心。 <img src="https://user-gold-cdn.xitu.io/2019/9/11/16d205d3a2367c2b?w=900&amp;h=383&amp;f=jpeg&amp;s=32674" alt=""></p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/NightTeam" class="author" itemprop="url" rel="index">NightTeam</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-20 01:12:35" itemprop="dateCreated datePublished" datetime="2019-09-20T01:12:35+08:00">2019-09-20</time>
                </span>
                <span id="/7540.html" class="post-meta-item leancloud_visitors" data-flag-title="用 Docker 搭建商业级 4G 代理" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>5.9k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>5 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7463.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7463.html" class="post-title-link" itemprop="url">谈一谈博客的关注解锁文章功能</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>
                  <p>在这个互联网时代，拥有流量就仿佛拥有了一切。 我大约在 2014 年底开了自己的个人博客，当时就是想自己记录点学习总结，一个是方便查阅，二是锻炼一下自己写总结或者文章的能力，最初就是记录一些日常生活、编程学习的小知识点什么的。 一次偶然的机会我接触了爬虫，当时用 Python 写爬虫的仿佛也不多，正好有一位学长有研究，我也就跟着他学了起来，学的时候也是自己总结，然后把一些文章发表到博客上，累积了十几篇左右。不知道是什么原因，渐渐地好像爬虫火了起来，Python 也火了起来，不知不觉地我发现我的博客慢慢地流量涨起来了，一天几百、几千一直到现在上万的浏览量，SEO 也逐渐好了起来，说实话我当时都没有想到，感觉还是不少运气成分在里面的。 两年前左右我开了一个公众号，开始在公众号上面发一些文章，自己也逐渐从博客转战到公众号上面了，因为公众号的环境总体来说还是很不错的，尤其是对原创作者来说非常友好，非常尊重原创。转载文章需要开白、原创命中和保护机制、洗稿检测、及时的投诉处理让越来越多的技术人员也转到公众号上来了。所以越来越多的技术开发者都拥有了自己的公众号，变成了一个人人公号的时代。这导致了一个什么结果？竞争日益激烈，读者的可选择范围太多了，大家的涨粉之路也走得越来越艰辛了。 同样地我也遭受着同样的苦恼，这时候我突然想起来，我似乎还有个博客呢？最近写文都专注于公众号，没太有心思打理自己博客了。我在想要是能够把我的博客流量转换到我的公众号上来该多好呢？一来我的博客读者可以关注到我的公号平时发的文章或通知，二来也着实能为自己的公号涨一点粉丝，这样该多好啊？ 思来想去我想到了一个法子，就是在浏览博客文章的时候，把后续内容隐藏，留一个二维码，可以通过关注公众号解锁。 当时设想效果图就是这样子的： <img src="https://qiniu.cuiqingcai.com/2019-09-14-154546.png" alt="image-20190914224503383"> 文章在某个位置会渐变隐藏，同时浮现一个公众号的样子，需要扫码才能解锁。这时候读者扫码自动关注了公号，博客文章也自然而然地解锁，这样博客的读者就自然关注到公号上面来了。</p>
                  <h2 id="功能要点"><a href="#功能要点" class="headerlink" title="功能要点"></a>功能要点</h2>
                  <p>一听到这样的法子，大家肯定就骂起来了，文章还要解锁来看？每篇文章都要解锁一遍吗？以后如果再打开还需要次次解锁吗？ 如果真的是这样，那我情愿不做这个功能，因为这太损伤「用户体验」了。为了尽量减少用户体验的损失，这个功能必须要满足以下几点：</p>
                  <ul>
                    <li>不要添加用户登录注册机制，一旦增加了这个机制，流程可能会大大复杂化，导致用户体验急剧下降。</li>
                    <li>不能每打开一个页面都要解锁一次，读者访问了我的博客，只需要一次解锁，即可全面解锁博客所有文章。</li>
                    <li>读者在关闭浏览器再重新打开浏览器浏览博客的时候，同样不能让读者再解锁一遍，要直接可看。</li>
                    <li>读者在手机或其他移动设备上不方便操作，手机站点禁止启用本功能。</li>
                  </ul>
                  <p>如果满足了这些条件，读者在一篇文章里面只要扫码解锁了一次，那么就可以永久解锁全站文章了，没有繁琐的登录注册功能，也不需要次次频繁解锁，这样用户体验就非常好了。 为了达成这个目的，我就开始开发这个功能了。</p>
                  <h2 id="识别用户"><a href="#识别用户" class="headerlink" title="识别用户"></a>识别用户</h2>
                  <p>那么怎么来实现呢？要实现上面的功能，其实最重要的就是来识别是哪一个用户，也就是说，我怎么知道到底是谁在浏览我的博客呢？我怎么来专门针对这个用户解锁呢？ 有的同学可能说那就用 IP 地址呗，技术角度是可以实现的，但是其实仔细想想，用 IP 地址是很不友好的。一来是很多用户可能都是内网的 IP 地址，多个用户共享一个公网 IP 地址，所以假如两台设备接入了同一个公网 IP，我是无法判断到底解锁哪一台设备的。二来是，如果一个用户换了其他的地方或者用了 VPN，IP 地址变了，原本解锁的设备又变成非解锁状态了。这样也不好。 那么最方便简单的用来标识一个浏览设备的东西是什么？当然是 Cookies。Cookies 里面保存了浏览网页时自动生成的 Session ID，而且每一个用户都是不一样的，这样不就可以来唯一标识一台浏览设备了吗？</p>
                  <h2 id="解锁逻辑"><a href="#解锁逻辑" class="headerlink" title="解锁逻辑"></a>解锁逻辑</h2>
                  <p>好，那有了用户的 ID，我怎么才能把用户 ID 和我的公众号关联起来呢？当然是把这个 ID 发到公众号后台，我来存起来就好了。然后博客这边定时检测我这边有没有把这个 ID 保存，如果保存了，那就呈现解锁状态，如果没有保存，那就呈现非解锁状态。 最开始我就设想，既然公众号要扫码关注，那么我能不能把这个 ID 也糅到二维码里面呢？这样关注公众号的时候既能查询到公众号，又能传递过来一个 ID 作为参数，然后后台处理一下存起来就好了。 你别说还真有这个功能，我在微信平台官方文档里面查询到了一个「生成带参数的公众号二维码」，生成的二维码里面可以指定任意的参数，然后生成的二维码图案就是公众号的二维码，然后处理一下关注公众号的回调函数就可以执行某一些操作了。看到之后我就想起来了很多关注公众号自动登录的功能就是这么做的。 但是经过一系列操作，发现了一个很悲伤的事情，只有服务号才有这个功能，我一小小的订阅号，是没有这个权限的，不能生成带参数的二维码。哎，难道凉了吗？ 不，没有，既然这个参数不能通过二维码传递，那就只好麻烦读者手动把这 ID 输入到我的公众号了，我的小小的订阅号还是有处理消息的功能的。我的公众号后台接收到消息，然后处理下这个消息 ID，然后存起来，那不就好了吗？ 说干就干！</p>
                  <h2 id="隐藏文章"><a href="#隐藏文章" class="headerlink" title="隐藏文章"></a>隐藏文章</h2>
                  <p>怎么开始做呢？那就从隐藏文章开始做吧。首先这个隐藏不能是真正的后台的隐藏，需要在前台隐藏。如果是后台隐藏的话，搜索引擎所能爬到的我的网站内容就会缺失了，会影响 SEO 的。所以只需要前台 CSS 隐藏一下就好了。 怎样看起来隐藏得比较自然呢？就取文章的一半的地方，把文章的下面部分用 CSS 藏起来，然后加个渐变效果就好了。 比如要隐藏一半的内容吧，首先可以获取文章区块的高度，然后把文章页面高度用 CSS 强制设置为原来的一半就好了，这个很好操作，然后再在最底下加个渐变的样子，仿佛底下还有文字的样子。 这个 CSS 用 background 属性就能实现了，参考代码如下：</p>
                  <figure class="highlight css">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-id">#locker</span> &#123;</span><br><span class="line">        <span class="attribute">height</span>: <span class="number">240px</span>;</span><br><span class="line">    <span class="attribute">width</span>: <span class="number">100%</span>;</span><br><span class="line">    <span class="attribute">background</span>: <span class="built_in">-webkit-gradient</span>(linear, <span class="number">0</span> top, <span class="number">0</span> bottom, from(rgba(<span class="number">255</span>, <span class="number">255</span>, <span class="number">255</span>, <span class="number">0</span>)), <span class="built_in">to</span>(#fff));</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里就是设置个 240 像素的区块，然后从上面到下面是透明度渐变颜色就好了，整体效果是下面这个样子： <img src="https://qiniu.cuiqingcai.com/2019-09-14-151650.png" alt="image-20190914231647956"> 好，既然隐藏了，那么下面就加个提示吧，把公众号的二维码先放上，然后把那个 Session ID 放上，提示用户关注公众号后发送这个 ID 就能解锁了，但这个 ID 又不能太长，多少呢？六位吧那就。 类似做成这样的样子： <img src="https://qiniu.cuiqingcai.com/2019-09-14-151807.png" alt="image-20190914231805506"> 好，那么这个 ID 怎么获取的呢？ 刚才说了，从 Cookies 里面获取就行了，找那个能够标识 Session ID 的一个 Cookies 字段，然后摘取其值的其中几位就行了，摘取的位置也有讲究，前几位仿佛重复率很高的样子，后面几位几乎不重复，那就截取最后六位数字吧。 好，然后我就在博客里面加了这么一点 JavaScript 代码来实现这个 ID 的提取：</p>
                  <figure class="highlight xquery">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">function</span> getCookie<span class="built_in">(name</span>) &#123;</span><br><span class="line">  var <span class="keyword">value</span> = <span class="string">"; "</span> +<span class="built_in"> document</span>.cookie;</span><br><span class="line">  var parts = <span class="keyword">value</span>.split(<span class="string">"; "</span> +<span class="built_in"> name</span> + <span class="string">"="</span>);</span><br><span class="line">  <span class="keyword">if</span> (parts.length == <span class="number">2</span>) <span class="keyword">return</span> parts.pop().split(<span class="string">";"</span>).shift();</span><br><span class="line">&#125;</span><br><span class="line"></span><br><span class="line"><span class="keyword">function</span> getToken() &#123;</span><br><span class="line">  <span class="keyword">let</span> <span class="keyword">value</span> = getCookie(<span class="string">'UM_distinctid'</span>)</span><br><span class="line">  <span class="keyword">if</span> (!<span class="keyword">value</span>) &#123;</span><br><span class="line">    <span class="keyword">return</span> defaultToken</span><br><span class="line">  &#125;</span><br><span class="line">  <span class="keyword">return</span> <span class="keyword">value</span><span class="built_in">.substring</span>(<span class="keyword">value</span>.length - <span class="number">6</span>).toUpperCase()</span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里 getCookie 方法是用某个名字获取一个 Cookies 字段，getToken 方法是截取了 Cookies 这个字段值的后六位并做了大写处理。 这里我的一个可以用来标识 Session ID 的 Cookies 字段叫做 UM_distinctid，就用它了。 这样一来，每个用户浏览的时候就能生成这样的一个 ID 了，六位的。 胜利似乎越来越近了。</p>
                  <h2 id="持久化存储"><a href="#持久化存储" class="headerlink" title="持久化存储"></a>持久化存储</h2>
                  <p>这里就又遇到一个问题，刚才不是说还要在用户关闭浏览器之后再重新打开，依然能保持解锁状态吗？这就要求这个 ID 在用户关闭又打开浏览器的时候是不变的。 这个怎么解？很简单，反正已经是从 Cookies 里面读了，这个 Cookies 持久化就行了，只要不在浏览器关闭后清除就行了，怎么办？设置个过期时间就好。 由于我的站点是 WordPress 做的，所以这个功能自动有了，如果没有的话用一些插件也能实现的。</p>
                  <h2 id="公众号处理"><a href="#公众号处理" class="headerlink" title="公众号处理"></a>公众号处理</h2>
                  <p>好，现在 ID 也有了，用户扫码把这个 ID 发到公众号后台就行了吧，然后公众号对接开发者模式处理一下就好了。 这里其实就很简单了，其实仅仅就是把用户的 OpenID 和这个码存到了一个数据库里面。我后台是用 Django 写的，所以用了 Django 里面的 Model，实现逻辑如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">unlock</span><span class="params">(source, target, content)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    解锁博客</span></span><br><span class="line"><span class="string">    :param target: 微信平台</span></span><br><span class="line"><span class="string">    :param source: 用户</span></span><br><span class="line"><span class="string">    :param content: 用户发来的码</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    Unlock.objects.get_or_create(openid=source, token=content.upper())</span><br><span class="line">    <span class="keyword">return</span> reply_text(source, target, <span class="string">'恭喜您已经解锁博客全部文章~'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>就是这么两行，插入了一条数据，然后返回了一个信息提示。 插入之后怎么办呢？博客得知道我已经把这条数据插入进来了呀？那就再提供一个 API 查询吧，实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">is_locked</span><span class="params">(request)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    判断是否已经解锁</span></span><br><span class="line"><span class="string">    :param request: 包含token的请求</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    token = request.GET.get(<span class="string">'token'</span>)</span><br><span class="line">    result = Unlock.objects.filter(token=token.upper()).first()</span><br><span class="line">    <span class="keyword">return</span> JsonResponse(&#123;</span><br><span class="line">        <span class="string">'locked'</span>: <span class="literal">False</span> <span class="keyword">if</span> result <span class="keyword">else</span> <span class="literal">True</span></span><br><span class="line">    &#125;)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>把这个方法对接一个 API 接口，比如 <code>/api/locked?token=xxxxx</code>，就可以知道是否解锁了。 所以，在公众号后台我就用开发者模式对接了这么两个功能，一个用来存，一个用来查。只要用户发送了这个能够用来表示自己浏览设备的码，我就存下来，然后博客定时请求这个 API 查询状态，如果返回结果是未解锁状态，那就继续锁住，如果是解锁状态，那就把博客解开。</p>
                  <h2 id="博客端处理"><a href="#博客端处理" class="headerlink" title="博客端处理"></a>博客端处理</h2>
                  <p>那么博客端具体怎么来处理呢？就基本的轮询就好了，定时几秒查一次 API，然后把这个码当做参数传过去，然后根据查询结果执行解锁或非解锁操作就好了。 核心代码如下：</p>
                  <figure class="highlight arcade">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">$(articleSelector).ready(<span class="function"><span class="keyword">function</span> (<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="keyword">var</span> articleElement = $(articleSelector)[<span class="number">0</span>]</span><br><span class="line">  <span class="keyword">if</span> (articleElement) &#123;</span><br><span class="line">    <span class="keyword">var</span> height = articleElement.clientHeight</span><br><span class="line">    <span class="keyword">var</span> halfHeight = height * <span class="number">0.3</span></span><br><span class="line">    <span class="keyword">var</span> token = getToken()</span><br><span class="line">    $(<span class="string">'#locker'</span>).find(<span class="string">'.token'</span>).text(token)</span><br><span class="line">    <span class="function"><span class="keyword">function</span> <span class="title">detect</span>(<span class="params"></span>) </span>&#123;</span><br><span class="line">      $.ajax(&#123;</span><br><span class="line">        url: <span class="string">'https://weixin.cuiqingcai.com/api/locked/'</span>,</span><br><span class="line">        method: <span class="string">'GET'</span>,</span><br><span class="line">        data: &#123;</span><br><span class="line">          token: token</span><br><span class="line">        &#125;,</span><br><span class="line">        success: <span class="function"><span class="keyword">function</span> (<span class="params">data</span>) </span>&#123;</span><br><span class="line">          <span class="keyword">if</span> (data.locked === <span class="literal">true</span> || data.locked === <span class="literal">false</span>) &#123;</span><br><span class="line">            locked = data.locked</span><br><span class="line">          &#125;</span><br><span class="line">        &#125;,</span><br><span class="line">        error: <span class="function"><span class="keyword">function</span> (<span class="params">data</span>) </span>&#123;</span><br><span class="line">          locked = <span class="literal">false</span></span><br><span class="line">        &#125;</span><br><span class="line">      &#125;)</span><br><span class="line">    &#125;</span><br><span class="line">  &#125;</span><br><span class="line">&#125;)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里就用基本的 jQuery 实现的，其实就是调了个 Ajax，也没啥高深的技巧。这里唯一值得注意的一点设计就是，如果 API 请求失败，这基本上证明我的 API 服务挂掉了，这里就需要把 locked 设置为 false，证明为解锁状态。这样，万一我的 API 后台挂了，博客会直接是解锁状态，这样就避免了读者永远无法解锁了。这是一个细节上的设计。 至此，一些技术上的问题就基本解决了。</p>
                  <h2 id="手机端处理"><a href="#手机端处理" class="headerlink" title="手机端处理"></a>手机端处理</h2>
                  <p>最后回过头来看看，哪个需求还没有满足？ 读者在手机或其他移动设备上不方便操作，手机站点禁止启用本功能。那么怎么实现呢？很简单，判断一下浏览器的 User-Agent 就好了，这里实现了一个判断是否是 PC 的方法：</p>
                  <figure class="highlight arcade">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">var</span> os = <span class="function"><span class="keyword">function</span> (<span class="params"></span>) </span>&#123;</span><br><span class="line">  <span class="keyword">var</span> ua = navigator.userAgent,</span><br><span class="line">    isWindowsPhone = <span class="regexp">/(?:Windows Phone)/</span>.test(ua),</span><br><span class="line">    isSymbian = <span class="regexp">/(?:SymbianOS)/</span>.test(ua) || isWindowsPhone,</span><br><span class="line">    isAndroid = <span class="regexp">/(?:Android)/</span>.test(ua),</span><br><span class="line">    isFireFox = <span class="regexp">/(?:Firefox)/</span>.test(ua),</span><br><span class="line">    isChrome = <span class="regexp">/(?:Chrome|CriOS)/</span>.test(ua),</span><br><span class="line">    isTablet = <span class="regexp">/(?:iPad|PlayBook)/</span>.test(ua) || (isAndroid &amp;&amp; !<span class="regexp">/(?:Mobile)/</span>.test(ua)) || (isFireFox &amp;&amp; <span class="regexp">/(?:Tablet)/</span>.test(ua)),</span><br><span class="line">    isPhone = <span class="regexp">/(?:iPhone)/</span>.test(ua) &amp;&amp; !isTablet,</span><br><span class="line">    isPc = !isPhone &amp;&amp; !isAndroid &amp;&amp; !isSymbian;</span><br><span class="line">  <span class="keyword">return</span> &#123;</span><br><span class="line">    isTablet: isTablet,</span><br><span class="line">    isPhone: isPhone,</span><br><span class="line">    isAndroid: isAndroid,</span><br><span class="line">    isPc: isPc</span><br><span class="line">  &#125;</span><br><span class="line">&#125;()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样一来，调用 os.isPc 就可以知道当前浏览器是不是 PC 浏览器了。 在处理的时候加上这个条件判断，就可以实现手机功能的解除了。</p>
                  <h2 id="效果"><a href="#效果" class="headerlink" title="效果"></a>效果</h2>
                  <p>可能大家想知道效果是如何的，这里就截图看看了，现在这个功能已经在我的博客 cuiqingcai.com 上线了，大家可以进去体验一下。 首先进去文章是这个样子的： <img src="https://qiniu.cuiqingcai.com/2019-09-14-154238.png" alt="文章处于锁定状态时的页面效果"> 然后关注了公众号，发送了代码： <img src="https://qiniu.cuiqingcai.com/2019-09-14-154422.png" alt="在公众号中发送解锁代码"> 发送完毕之后，大约一两秒之后，抬头看看博客，就是这个样子了： <img src="https://qiniu.cuiqingcai.com/2019-09-14-154408.png" alt="解锁之后的文章页面"> 这就已经完成了解锁和转化，读者可以全站永久解锁我的博客文章，我也增长了粉丝。 现在过一段时间就会有读者发来代码解锁，同时成为了我的粉丝，订阅号助手看到消息如下： <img src="https://qiniu.cuiqingcai.com/2019-09-14-154721.png" alt="订阅号助手中收到的读者解锁消息"> 以上便是这个博客转化的思路分享和实现，大家也可以到我的博客体验一下，谢谢！</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-15 00:00:19" itemprop="dateCreated datePublished" datetime="2019-09-15T00:00:19+08:00">2019-09-15</time>
                </span>
                <span id="/7463.html" class="post-meta-item leancloud_visitors" data-flag-title="谈一谈博客的关注解锁文章功能" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>6.2k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>6 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7447.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7447.html" class="post-title-link" itemprop="url">今天，大佬云集的夜幕团队正式成立了！</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>今天这篇文章是要告诉你，业内多名实力强劲的开发者组建了一个服务于广大开发者的团队。现在，你可能会有下面这些疑问：</p>
                  <ul>
                    <li>为什么要组成团队呢？</li>
                    <li>这个团队将会给广大开发者带来什么？</li>
                    <li>团队成员有哪些？</li>
                  </ul>
                  <p>好了，接下来用 3 分钟时间去了解这个团队吧！</p>
                  <h3 id="夜幕团队简介"><a href="#夜幕团队简介" class="headerlink" title="夜幕团队简介"></a>夜幕团队简介</h3>
                  <p>夜幕团队 NightTeam 于 2019 年 9 月 9 日正式成立，团队由爬虫领域中实力强劲的多名开发者组成：崔庆才、周子淇、陈祥安、唐轶飞、冯威、蔡晋、戴煌金、张冶青和韦世东。 NightTeam 涉猎的编程语言包括但不限于 Python、Rust、C++、Go，领域涵盖爬虫、深度学习、服务研发和对象存储等，团队技术实力十分雄厚。</p>
                  <p><img src="https://pic1.zhimg.com/80/v2-61dd30339bf8ce2bbe775c51b1339d44_hd.jpg" alt=""></p>
                  <h3 id="为什么要组建一个团队？"><a href="#为什么要组建一个团队？" class="headerlink" title="为什么要组建一个团队？"></a>为什么要组建一个团队？</h3>
                  <p>据以往经验来看，产出一篇优质的技术文章所耗费的时间是相当长的，读者很难从单个作者那里获得成体系且覆盖面较广的知识。 对于作者而言，粉丝的积累、文章的持续编写都是一个慢跑的过程，很多优秀的作者会因为没有粉丝、投入产出比差距太大等因素暂停写作。 更有甚者，文章内容质量越来越低、标题越来越唬、广告越来越多，一门心思放在如何涨粉和变现上。如此循环，就会影响技术圈和技术生态的发展。 如果将那些优秀的作者联合起来（即合纵连横），共同运营一系列的技术媒体号，在不增加负担的情况下还能保证优质文章的产出，会怎么样呢？</p>
                  <p><img src="https://pic4.zhimg.com/80/v2-6aabf56efac5d50da23dc660ad72fe1b_hd.jpg" alt=""></p>
                  <p>我们也不知道，但是我们已经意识到了这个问题，也尝试着寻找答案。这就是夜幕团队 NightTeam 组建的大环境背景。</p>
                  <h3 id="团队希望做什么事？"><a href="#团队希望做什么事？" class="headerlink" title="团队希望做什么事？"></a>团队希望做什么事？</h3>
                  <p>互联网是开放的，技术也是开放的。适当的共享可以加速我们的进步，无论是作者还是读者，都能够从分享的过程中获得一些东西。</p>
                  <p><img src="https://pic1.zhimg.com/80/v2-3f85ec0502edafca985cc1f8db399e30_hd.jpg" alt=""></p>
                  <p>在开发中，我们会用到一些库和框架。当我刚编程入门的时候，就想着有一天也要编写开源项目，为其他开发者提供帮助，这正是夜幕团队想要做的事。 当然，我们想要做的不仅仅是编写一个开源项目，我们更希望将知识分享出去，在技术圈中传播开来，让更多的人得到帮助。</p>
                  <h3 id="团队能够为开发者带来什么？"><a href="#团队能够为开发者带来什么？" class="headerlink" title="团队能够为开发者带来什么？"></a>团队能够为开发者带来什么？</h3>
                  <p>读者的能力和阶段各不相同，每个人收获的内容也不同。宽泛来说，读者可以从夜幕团队 NightTeam 输出的内容中获得对待问题的处理方式、分析问题的思路、解决问题的技巧、问题背后的逻辑等。 假如你是一名中级爬虫工程师，那么夜幕团队 NightTeam 输出的内容可能会为你解决工作中遇到的一些反爬虫问题带来思路。</p>
                  <p><img src="https://pic1.zhimg.com/80/v2-6660c7d888e868dc1fc55caaa3a98938_hd.jpg" alt=""></p>
                  <p>如果你是一名后端开发者，你也许会从我们发布的类似于【动图演示 - Redis 持久化 RDB/AOF 详解与实践】这样的文章中了解到 Redis 持久化的两种选择的差异和具体操作过程。</p>
                  <h3 id="团队以什么形式输出内容？"><a href="#团队以什么形式输出内容？" class="headerlink" title="团队以什么形式输出内容？"></a>团队以什么形式输出内容？</h3>
                  <p>大多数情况下，我们会用文章的形式输出内容。但有时候也会采用直播或者视频教程的方式，兴许还会有现场交流的机会。 为了增加传播广度和影响力，我们不仅仅在微信公众号上发布文章，还会将文章同步到业内著名的几个平台，例如掘金社区、CSDN、SF、V2EX、知乎、今日头条等。</p>
                  <h3 id="团队成员介绍"><a href="#团队成员介绍" class="headerlink" title="团队成员介绍"></a>团队成员介绍</h3>
                  <p>团队的成员都是爬虫领域比较活跃的作者，同时也拥有非常强的实力，不会是那些只负责回答小白问题骗钱的盗名之辈。以下将不分先后地列出团队成员姓名、昵称和各自的介绍。</p>
                  <p>周子淇</p>
                  <blockquote>
                    <p>昵称 Loco 前微信公众号「小周码字」号主、知乎专栏「手把手教你写爬虫」作者，幂度爬虫工程师。啥方向都搞，除了爬虫相关的文章以外还会写一些灰黑产操作研究、机器学习、造轮子、物联网设备研究、脑洞分享等各种奇奇怪怪的东西。</p>
                  </blockquote>
                  <p>韦世东</p>
                  <blockquote>
                    <p>昵称 Asyncins 图灵签约作者、电子工业出版社约稿作者、华为云认证云享专家、掘金社区优秀作者、GitChat 认证作者、搜狐产品技术约稿作者、开源项目 aiowebsocket 作者、微信公众号「Rust之禅」号主、「进击的Coder」运营者之一，有着丰富的爬虫经验，擅长反爬虫的绕过技巧。</p>
                  </blockquote>
                  <p>崔庆才</p>
                  <blockquote>
                    <p>昵称 静觅 畅销书《Python3网络爬虫开发实战》作者、微信公众号「进击的Coder」号主，微软中国工程师。主要研究网络爬虫、机器学习、Web 开发相关内容。</p>
                  </blockquote>
                  <p>陈祥安</p>
                  <blockquote>
                    <p>昵称 CXA 微信公众号「Python学习开发」号主、CSDN 线下沙龙特邀讲师、华为云享社区专家、阿里云栖社区专家、哔哩哔哩《陈祥安分析Python面试题》系列 UP 主、GitChat 热门文章《Python 常见的 170 道面试题全解析：2019 版》作者，马蜂窝高级爬虫工程师。</p>
                  </blockquote>
                  <p>唐轶飞</p>
                  <blockquote>
                    <p>昵称 大鱼 BruceDone 微信公众号「大鱼鱼塘」号主、「<a href="https://link.zhihu.com/?target=http%3A//brucedone.com">http://brucedone.com</a>」站长，腾讯后端工程师。多年 Code 经验，擅长后端开发，语言了解但不限于 .NET、Python、Golang、SQL，兴趣包含但不限于爬虫，后端，数据库，深度学习。</p>
                  </blockquote>
                  <p>冯 威</p>
                  <blockquote>
                    <p>昵称 妄为 微信公众号「妄为写代码」号主，爬虫 Coder，佛系程序员。专注 JavaScript、Android 逆向以及验证码破解，对逆向有着丰富的经验。</p>
                  </blockquote>
                  <p>蔡 晋</p>
                  <blockquote>
                    <p>昵称 悦来客栈的老板 微信公众号「菜鸟学Python编程」号主，平时喜欢研究各大网站的反爬，熟悉常见网站的反爬操作，对反爬有着独到的见解。</p>
                  </blockquote>
                  <p>戴煌金</p>
                  <blockquote>
                    <p>昵称 咸鱼 微信公众号「咸鱼学Python」号主、华为云享专家。专注Python爬虫、JavaScript逆向，立志做一条最咸的咸鱼。</p>
                  </blockquote>
                  <p>张冶青</p>
                  <blockquote>
                    <p>昵称：MarvinZ 微信公众号「Crawlab漫游指南」号主、爬虫管理平台 Crawlab 作者、文章发布平台 ArtiPub 作者，知名外企前端开发工程师。专注前端、爬虫和数据分析。</p>
                  </blockquote>
                  <h3 id="如何与夜幕取得联系？"><a href="#如何与夜幕取得联系？" class="headerlink" title="如何与夜幕取得联系？"></a>如何与夜幕取得联系？</h3>
                  <p>你可能想跟夜幕团队交流一些技术方面的问题，你可以发送消息“夜幕读者群”到我们的公众号「NightTeam」加入读者群，团队成员都在群里等你。 一些重要的事可以通过邮件与夜幕取得联系，夜幕团队的邮箱为 contact@nightteam.cn。 GitHub 也准备好了，我们会逐渐将开源项目迁移到团队的仓库中，地址为 <a href="https://link.zhihu.com/?target=https%3A//github.com/nightteam">https://github.com/nightteam</a></p>
                  <p><img src="https://pic2.zhimg.com/80/v2-259f9f2796f2bcc2980cefc49ed237d5_hd.jpg" alt=""></p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-12 12:25:31" itemprop="dateCreated datePublished" datetime="2019-09-12T12:25:31+08:00">2019-09-12</time>
                </span>
                <span id="/7447.html" class="post-meta-item leancloud_visitors" data-flag-title="今天，大佬云集的夜幕团队正式成立了！" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>2.4k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>2 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7436.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7436.html" class="post-title-link" itemprop="url">爬虫智能解析库 Readability 和 Newspaper 的用法</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>舆情爬虫是网络爬虫一个比较重要的分支，舆情爬虫往往需要爬虫工程师爬取几百几千个新闻站点。比如一个新闻页面我们需要爬取其标题、正文、时间、作者等信息，如果用传统的方式来实现，每一个站点都要配置非常多的规则，如果要维护一个几百上千的站点，那人力成本简直太高了。 如果有一种方式可以在保证差不多的准确率的前提下，大幅提高提取效率的话，就需要用到智能文本提取了。 本文首先介绍一下智能文本提取的基本原理，让大家对智能提取有基本的了解。然后介绍几个比较基础的工具包，准确率并不是很高，可以尝试一用。最后再介绍几篇比较前沿的技术供大家参考。</p>
                  <h2 id="智能文本提取"><a href="#智能文本提取" class="headerlink" title="智能文本提取"></a>智能文本提取</h2>
                  <p>目前来说，智能文本提取可以分为三类：</p>
                  <ul>
                    <li>基于网页文档内容的提取方法</li>
                    <li>基于 DOM 结构信息的提取方法</li>
                    <li>基于视觉信息的提取方法</li>
                  </ul>
                  <p>基于网页文档的提取方法将 HTML 文档视为文本进行处理，适用于处理含有大量文本信息且结构简单易于处理的单记录网页，或者具有实时要求的在线分析网页应用。 这种方式主要利用自然语言处理相关技术实现，通过理解文本语义、分析上下文、设定提取规则等，实现对大段网页文档的快速处理。其中，较为知名的方法有 TSIMMIS、Web-OQL、Serrano、FAR-SW 和 FOREST，但这些方法通常需要人工的参与，且存在耗时长、效率低的弊端。 基于 DOM 结构信息的方法将 HTML 文档解析为相应的 DOM 树，然后根据 DOM 树的语法结构创建提取规则，相对于以前的方法而言有了更高的性能和准确率。 W4F 和 XWRAP 将 HTML 文档解析成 DOM 树，然后通过组件化引导用户通过人工选择或者标记生成目标包装器代码。Omini、IEPAD 和 ITE 提取 DOM 树上的关键路径，获取其中存在的重复模式。MDR 和 DEPTA 挖掘了页面中的数据区域，得到数据记录的模式。CECWS 通过聚类算法从数据库中提取出自同一网站的一组页面，并进行 DOM 树结构的对比，删除其中的静态部分，保留动态内容作为信息提取的结果。虽然此类方法相对于上一类方法具有较高的提取精度，且克服了对大段连续文本的依赖，但由于网页的 DOM 树通常较深，含有大量 DOM 节点，因此基于 DOM 结构信息的方法具有较高的时间和空间消耗。目前来说，大部分原理还是基于 DOM 节点的文本密度、标点符号密度等计算的，其准确率还是比较可观的。今天所介绍的 Readability 和 Newspaper 这两个库的实现原理就与此类似。 目前比较先进的是基于视觉信息的网页信息提取方法，通过浏览器接口或者内核对目标网页预渲染，然后基于网页的视觉规律提取网页数据记录。经典的 VIPS 算法首先从 DOM 树中提取出所有合适的页面区域，然后根据这些页面和分割条重新构建 Web 页面的语义结构。作为对 VIPS 的拓展，ViNT、ViPER、ViDE 也成功利用了网页的视觉特征来实现数据提取。CMDR 通过神经网络学习多记录型页面中的特征，结合基于 DOM 结构信息的 MDR 方法，挖掘社区论坛页面的数据区域。与上述方法不同，VIBS 将图像领域的 CNN 卷积神经网络运用于网页的截图，同时通过类 VIPS 算法生成视觉块，最后结合两个阶段的结果识别网页的正文区域。另外还有最新的国内提出的 VBIE 方法，在基于网页视觉的方法基础上加以改进，可以实现无监督的网页信息提取。</p>
                  <blockquote>
                    <p>以上内容主要参考自论文：《王卫红等：基于可视块的多记录型复杂网页信息提取算法》，算法可从该论文参考文献查阅。</p>
                  </blockquote>
                  <p>下面我们来介绍两个比较基础的工具包 Readability 和 Newspaper 的用法，这两个包经我测试其实准确率并不是很好，主要是让大家大致对智能解析有初步的理解。后面还会介绍一些更加强大的智能化解析算法。</p>
                  <h2 id="Readability"><a href="#Readability" class="headerlink" title="Readability"></a>Readability</h2>
                  <p>Readability 实际上是一个算法，并不是一个针对某个语言的库。其主要原理就是计算了 DOM 的文本密度，另外根据一些常见的 DOM 属性如 id、class 等计算了一些 DOM 的权重，最后分析得到了对应的 DOM 区块，进而提取出具体的文本内容。 现在搜索 Readability 其实已经找不到了，取而代之的是一个 JavaScript 工具包，叫做 mercury-parser，据我所知应该是 Readability 不维护了，换成了 mercury-parser。后者现在也做成了一个 Chrome 插件，大家可以下载使用一下。 回归正题，这次主要介绍的是 Python 的 Readability 实现，现在其实有很多开源版本，本文选取的是 <a href="https://github.com/buriy/python-readability" target="_blank" rel="noopener">https://github.com/buriy/python-readability</a>，是基于最早的 Python 版本的 Readability 库 <a href="https://github.com/timbertson/python-readability" target="_blank" rel="noopener">https://github.com/timbertson/python-readability</a> 二次开发的，现在已经发布到了 PyPI，大家可以直接下载安装使用。 安装很简单，通过 pip 安装即可：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">pip3 <span class="keyword">install</span> readability-lxml</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装好了之后便可以通过导入 readability 使用了，下面我们随便从网上找一个新闻页面，比如：<a href="https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html" target="_blank" rel="noopener">https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html</a>，其页面截图如下： <img src="https://qiniu.cuiqingcai.com/2019-09-09-125723.png" alt="页面示例"> 我们的目的就是提取它的正文、标题等内容。下面我们用 Readability 试一下，示例如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line"><span class="keyword">from</span> readability import Document</span><br><span class="line"></span><br><span class="line">url = <span class="string">'https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html'</span></span><br><span class="line">html = requests.<span class="builtin-name">get</span>(url).content</span><br><span class="line">doc = Document(html)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'title:'</span>, doc.title())</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'content:'</span>, doc.summary(<span class="attribute">html_partial</span>=<span class="literal">True</span>))</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们直接用 requests 库对网页进行了请求，获取了其 HTML 页面内容，赋值为 html。 然后引入了 readability 里的 Document 类，使用 html 变量对其进行初始化，然后我们分别调用了 title 方法和 summary 方法获得了其标题和正文内容。 这里 title 方法就是获取文章标题的，summary 就是获取文章正文的，但是它获取的正文可能包含一些 HTML 标签。这个 summary 方法可以接收一个 html_partial 参数，如果设置为 True，返回的结果则不会再带有 <code>&lt;html&gt;&lt;body&gt;</code> 标签。 看下运行结果：</p>
                  <figure class="highlight xml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">title: 今年iPhone只有小改进？分析师：还有其他亮点_网易科技</span><br><span class="line">content: <span class="tag">&lt;<span class="name">div</span>&gt;</span><span class="tag">&lt;<span class="name">div</span> <span class="attr">class</span>=<span class="string">"post_text"</span> <span class="attr">id</span>=<span class="string">"endText"</span>&gt;</span>           </span><br><span class="line">                    <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"otitle"</span>&gt;</span></span><br><span class="line">                        （原标题：Apple Bets More Cameras Can Keep iPhone Humming）</span><br><span class="line">                    <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">                    <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"f_center"</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">alt</span>=<span class="string">"今年iPhone只有小改进？分析师：还有其他亮点"</span> <span class="attr">src</span>=<span class="string">"http://cms-bucket.ws.126.net/2019/09/09/d65ba32672934045a5bfadd27f704bc1.jpeg"</span>/&gt;</span><span class="tag">&lt;<span class="name">span</span>&gt;</span>图示：苹果首席执行官蒂姆·库克(Tim Cook)在6月份举行的苹果全球开发者大会上。<span class="tag">&lt;/<span class="name">span</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>&gt;</span>网易科技讯 9月9日消息，据国外媒体报道，和过去的12个年头一样，新款</span><br><span class="line">... 
中间省略 ...</span><br><span class="line">                    <span class="tag">&lt;<span class="name">p</span>&gt;</span>苹果还即将推出包括电视节目和视频游戏等内容的新订阅服务。分析师表示，该公司最早可能在本周宣布TV+和Arcade等服务的价格和上线时间。<span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>&gt;</span>Strategy Analytics的尼尔·莫斯顿(Neil Mawston)表示，可穿戴设备和服务的结合将是苹果业务超越iPhone的关键。他说，上一家手机巨头诺基亚公司在试图进行类似业务转型时就陷入了困境之中。（辰辰）<span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>相关报道：<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"https://tech.163.com/19/0908/09/EOHS53RK000999LD.html"</span> <span class="attr">target</span>=<span class="string">"_self"</span> <span class="attr">urlmacroreplace</span>=<span class="string">"false"</span>&gt;</span>iPhone 11背部苹果Logo改为居中：为反向无线充电<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"https://tech.163.com/19/0907/08/EOF60CBC00097U7S.html"</span> <span class="attr">target</span>=<span class="string">"_self"</span> <span class="attr">urlmacroreplace</span>=<span class="string">"false"</span>&gt;</span>2019年新iPhone传言汇总，你觉得哪些能成真<span class="tag">&lt;/<span class="name">a</span>&gt;</span>  <span class="tag">&lt;/<span class="name">p</span>&gt;</span><span class="tag">&lt;<span class="name">p</span>/&gt;</span></span><br><span class="line">                        <span class="tag">&lt;<span class="name">p</span>/&gt;</span></span><br><span class="line">    
                    <span class="tag">&lt;<span class="name">div</span> <span class="attr">class</span>=<span class="string">"ep-source cDGray"</span>&gt;</span></span><br><span class="line">                            <span class="tag">&lt;<span class="name">span</span> <span class="attr">class</span>=<span class="string">"left"</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://tech.163.com/"</span>&gt;</span><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">"https://static.ws.126.net/cnews/css13/img/end_tech.png"</span> <span class="attr">alt</span>=<span class="string">"王凤枝"</span> <span class="attr">class</span>=<span class="string">"icon"</span>/&gt;</span><span class="tag">&lt;/<span class="name">a</span>&gt;</span> 本文来源：网易科技报道  <span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line">                            <span class="tag">&lt;<span class="name">span</span> <span class="attr">class</span>=<span class="string">"ep-editor"</span>&gt;</span>责任编辑：王凤枝_NT2541<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line">                        <span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">                <span class="tag">&lt;/<span class="name">div</span>&gt;</span> </span><br><span class="line"><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到，标题提取是正确的。正文其实也是正确的，不过这里还包含了一些 HTML 标签，比如 <code>&lt;img&gt;</code>、<code>&lt;p&gt;</code> 等，我们可以进一步通过一些解析库来解析。 看下源码吧，比如提取标题的方法：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">normalize_entities</span><span class="params">(cur_title)</span>:</span></span><br><span class="line">    entities = &#123;</span><br><span class="line">        <span class="string">u'\u2014'</span>:<span class="string">'-'</span>,</span><br><span class="line">        <span class="string">u'\u2013'</span>:<span class="string">'-'</span>,</span><br><span class="line">        <span class="string">u'&amp;mdash;'</span>: <span class="string">'-'</span>,</span><br><span class="line">        <span class="string">u'&amp;ndash;'</span>: <span class="string">'-'</span>,</span><br><span class="line">        <span class="string">u'\u00A0'</span>: <span class="string">' '</span>,</span><br><span class="line">        <span class="string">u'\u00AB'</span>: <span class="string">'"'</span>,</span><br><span class="line">        <span class="string">u'\u00BB'</span>: <span class="string">'"'</span>,</span><br><span class="line">        <span class="string">u'&amp;quot;'</span>: <span class="string">'"'</span>,</span><br><span class="line">    &#125;</span><br><span class="line">    <span class="keyword">for</span> c, r <span class="keyword">in</span> entities.items():</span><br><span class="line">        <span class="keyword">if</span> c <span class="keyword">in</span> cur_title:</span><br><span class="line">            cur_title = cur_title.replace(c, r)</span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> cur_title</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">norm_title</span><span class="params">(title)</span>:</span></span><br><span class="line">    <span class="keyword">return</span> normalize_entities(normalize_spaces(title))</span><br><span class="line"></span><br><span class="line"><span class="function"><span 
class="keyword">def</span> <span class="title">get_title</span><span class="params">(doc)</span>:</span></span><br><span class="line">    title = doc.find(<span class="string">'.//title'</span>)</span><br><span class="line">    <span class="keyword">if</span> title <span class="keyword">is</span> <span class="literal">None</span> <span class="keyword">or</span> title.text <span class="keyword">is</span> <span class="literal">None</span> <span class="keyword">or</span> len(title.text) == <span class="number">0</span>:</span><br><span class="line">        <span class="keyword">return</span> <span class="string">'[no-title]'</span></span><br><span class="line"></span><br><span class="line">    <span class="keyword">return</span> norm_title(title.text)</span><br><span class="line"></span><br><span class="line"> <span class="function"><span class="keyword">def</span> <span class="title">title</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""Returns document title"""</span></span><br><span class="line">    <span class="keyword">return</span> get_title(self._html(<span class="literal">True</span>))</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>title 方法实际上就是调用了一个 get_title 方法，它怎么做的？实际上就是用了一个 XPath 只解析了 <code>&lt;title&gt;</code> 标签里面的内容，别的没了。如果没有，那就返回 <code>[no-title]</code>。</p>
                  <figure class="highlight lasso">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def summary(<span class="built_in">self</span>, html_partial=<span class="literal">False</span>):</span><br><span class="line">    ruthless = <span class="literal">True</span></span><br><span class="line">    <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">        <span class="built_in">self</span>._html(<span class="literal">True</span>)</span><br><span class="line">        for i <span class="keyword">in</span> <span class="built_in">self</span>.tags(<span class="built_in">self</span>.html, <span class="string">'script'</span>, <span class="string">'style'</span>):</span><br><span class="line">            i.drop_tree()</span><br><span class="line">        for i <span class="keyword">in</span> <span class="built_in">self</span>.tags(<span class="built_in">self</span>.html, <span class="string">'body'</span>):</span><br><span class="line">            i.<span class="built_in">set</span>(<span class="string">'id'</span>, <span class="string">'readabilityBody'</span>)</span><br><span class="line">        <span class="keyword">if</span> ruthless:</span><br><span class="line">            <span class="built_in">self</span>.remove_unlikely_candidates()</span><br><span class="line">        <span class="built_in">self</span>.transform_misused_divs_into_paragraphs()</span><br><span class="line">        candidates = <span class="built_in">self</span>.score_paragraphs()</span><br><span class="line"></span><br><span class="line">        best_candidate = <span class="built_in">self</span>.select_best_candidate(candidates)</span><br><span class="line"></span><br><span class="line">        <span class="keyword">if</span> best_candidate:</span><br><span class="line">            article = <span class="built_in">self</span>.get_article(candidates, best_candidate,</span><br><span class="line">                                       html_partial=html_partial)</span><br><span class="line"> 
       <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">if</span> ruthless:</span><br><span class="line">                ruthless = <span class="literal">False</span></span><br><span class="line">                continue</span><br><span class="line">            <span class="keyword">else</span>:</span><br><span class="line">                article = <span class="built_in">self</span>.html.find(<span class="string">'body'</span>)</span><br><span class="line">                <span class="keyword">if</span> article is <span class="literal">None</span>:</span><br><span class="line">                    article = <span class="built_in">self</span>.html</span><br><span class="line">        cleaned_article = <span class="built_in">self</span>.sanitize(article, candidates)</span><br><span class="line">        article_length = len(cleaned_article <span class="literal">or</span> <span class="string">''</span>)</span><br><span class="line">        retry_length = <span class="built_in">self</span>.retry_length</span><br><span class="line">        of_acceptable_length = article_length &gt;= retry_length</span><br><span class="line">        <span class="keyword">if</span> ruthless <span class="literal">and</span> <span class="literal">not</span> of_acceptable_length:</span><br><span class="line">            ruthless = <span class="literal">False</span></span><br><span class="line">            continue</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">return</span> cleaned_article</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里我删除了一些冗余的调试的代码，只保留了核心的代码，其核心实现就是先去除一些干扰内容，然后找出一些疑似正文的 candidates，然后再去寻找最佳匹配的 candidates，最后提取其内容返回即可。 然后再找到获取 candidates 方法里面的 score_paragraphs 方法，又追踪到一个 score_node 方法，就是为每一个节点打分的，其实现如下：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def score_node(self, elem):</span><br><span class="line">   content_score = self.class_weight(elem)</span><br><span class="line">   <span class="type">name</span> = elem.tag.lower()</span><br><span class="line">   <span class="keyword">if</span> <span class="type">name</span> <span class="keyword">in</span> ["div", "article"]:</span><br><span class="line">       content_score += <span class="number">5</span></span><br><span class="line">   elif <span class="type">name</span> <span class="keyword">in</span> ["pre", "td", "blockquote"]:</span><br><span class="line">       content_score += <span class="number">3</span></span><br><span class="line">   elif <span class="type">name</span> <span class="keyword">in</span> ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:</span><br><span class="line">       content_score -= <span class="number">3</span></span><br><span class="line">   elif <span class="type">name</span> <span class="keyword">in</span> ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:</span><br><span class="line">       content_score -= <span class="number">5</span></span><br><span class="line">   <span class="keyword">return</span> &#123;</span><br><span class="line">       <span class="string">'content_score'</span>: content_score,</span><br><span class="line">       <span class="string">'elem'</span>: elem</span><br><span class="line">   &#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这什么意思呢？你看如果这个节点标签是 div 或者 article 等可能表征正文区块的话，就加 5 分，如果是 aside 等表示侧栏的内容就减 3 分。这些打分也没有什么非常标准的依据，可能是根据经验累积的规则。 另外还有一些方法里面引用了一些正则匹配来进行打分或者替换，其定义如下：</p>
                  <figure class="highlight gherkin">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"> REGEXES = &#123;</span><br><span class="line">    'unlikelyCandidatesRe': re.compile('combx|<span class="string">comment</span>|<span class="string">community</span>|<span class="string">disqus</span>|<span class="string">extra</span>|<span class="string">foot</span>|<span class="string">header</span>|<span class="string">menu</span>|<span class="string">remark</span>|<span class="string">rss</span>|<span class="string">shoutbox</span>|<span class="string">sidebar</span>|<span class="string">sponsor</span>|<span class="string">ad-break</span>|<span class="string">agegate</span>|<span class="string">pagination</span>|<span class="string">pager</span>|<span class="string">popup</span>|<span class="string">tweet</span>|<span class="string">twitter', re.I),</span></span><br><span class="line"><span class="string">    'okMaybeItsACandidateRe': re.compile('and</span>|<span class="string">article</span>|<span class="string">body</span>|<span class="string">column</span>|<span class="string">main</span>|<span class="string">shadow', re.I),</span></span><br><span class="line"><span class="string">    'positiveRe': re.compile('article</span>|<span class="string">body</span>|<span class="string">content</span>|<span class="string">entry</span>|<span class="string">hentry</span>|<span class="string">main</span>|<span class="string">page</span>|<span class="string">pagination</span>|<span class="string">post</span>|<span class="string">text</span>|<span class="string">blog</span>|<span class="string">story', re.I),</span></span><br><span class="line"><span class="string">    'negativeRe': re.compile('combx</span>|<span class="string">comment</span>|<span class="string">com-</span>|<span class="string">contact</span>|<span class="string">foot</span>|<span class="string">footer</span>|<span class="string">footnote</span>|<span class="string">masthead</span>|<span class="string">media</span>|<span class="string">meta</span>|<span 
class="string">outbrain</span>|<span class="string">promo</span>|<span class="string">related</span>|<span class="string">scroll</span>|<span class="string">shoutbox</span>|<span class="string">sidebar</span>|<span class="string">sponsor</span>|<span class="string">shopping</span>|<span class="string">tags</span>|<span class="string">tool</span>|<span class="string">widget', re.I),</span></span><br><span class="line"><span class="string">    'divToPElementsRe': re.compile('&lt;(a</span>|<span class="string">blockquote</span>|<span class="string">dl</span>|<span class="string">div</span>|<span class="string">img</span>|<span class="string">ol</span>|<span class="string">p</span>|<span class="string">pre</span>|<span class="string">table</span>|<span class="string">ul)', re.I),</span></span><br><span class="line"><span class="string">    #'replaceBrsRe': re.compile('(&lt;br[^&gt;]*&gt;[ \n\r\t]*)&#123;2,&#125;',re.I),</span></span><br><span class="line"><span class="string">    #'replaceFontsRe': re.compile('&lt;(/?)font[^&gt;]*&gt;',re.I),</span></span><br><span class="line"><span class="string">    #'trimRe': re.compile('^\s+</span>|<span class="string">\s+$/'),</span></span><br><span class="line"><span class="string">    #'normalizeRe': re.compile('\s&#123;2,&#125;/'),</span></span><br><span class="line"><span class="string">    #'killBreaksRe': re.compile('(&lt;br\s*/?&gt;(\s</span>|<span class="string">&amp;nbsp;?)*)&#123;1,&#125;/'),</span></span><br><span class="line"><span class="string">    'videoRe': re.compile('https?://(www\.)?(youtube</span>|<span class="string">vimeo)\.com', re.I),</span></span><br><span class="line"><span class="string">    #skipFootnoteLink:      /^\s*(\[?[a-z0-9]&#123;1,2&#125;\]?</span>|<span class="string">^</span>|<span class="string">edit</span>|<span class="string">citation needed)\s*$/i,</span></span><br><span class="line"><span class="string">&#125;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>比如这里定义了 unlikelyCandidatesRe，就是不像 candidates 的 pattern，比如 foot、comment 等等，碰到这样的标签或 pattern 的话，在计算分数的时候都会减分，另外还有其他的 positiveRe、negativeRe 也是一样的原理，分别对匹配到的内容进行加分或者减分。 这就是 Readability 的原理，是基于一些规则匹配的打分模型，很多规则其实来源于经验的累积，分数的计算规则应该也是不断地调优得出来的。 另外其他的就没了，Readability 并没有提供提取时间、作者的方法，另外此种方法的准确率也是有限的，但多少还是省去了一些人工成本。</p>
                  <h2 id="Newspaper"><a href="#Newspaper" class="headerlink" title="Newspaper"></a>Newspaper</h2>
                  <p>另外还有一个智能解析的库，叫做 Newspaper，提供的功能更强一些，但是准确率上个人感觉和 Readability 差不太多。 这个库分为 Python2 和 Python3 两个版本，Python2 下的版本叫做 newspaper，Python3 下的版本叫做 newspaper3k，这里我们使用 Python3 版本来进行测试。 其 GitHub 地址是：<a href="https://github.com/codelucas/newspaper" target="_blank" rel="noopener">https://github.com/codelucas/newspaper</a>，官方文档地址是：<a href="https://newspaper.readthedocs.io/" target="_blank" rel="noopener">https://newspaper.readthedocs.io</a>。 在安装之前需要安装一些依赖库，可以参考官方的说明：<a href="https://github.com/codelucas/newspaper#get-it-now" target="_blank" rel="noopener">https://github.com/codelucas/newspaper#get-it-now</a>。 安装好必要的依赖库之后，就可以使用 pip 安装了：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">pip3 <span class="keyword">install</span> newspaper3k</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装成功之后便可以导入使用了。 下面我们先用官方提供的示例来过一遍它的用法，官方提供的示例是使用了这个链接：<a href="https://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/" target="_blank" rel="noopener">https://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/</a>，其页面截图如下： <img src="https://qiniu.cuiqingcai.com/2019-09-09-125754.png" alt="官方示例"> 下面用一个实例来感受一下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> newspaper import Article</span><br><span class="line"></span><br><span class="line">url = <span class="string">'https://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'</span></span><br><span class="line">article = Article(url)</span><br><span class="line">article.download()</span><br><span class="line"><span class="comment"># print('html:', article.html)</span></span><br><span class="line"></span><br><span class="line">article.parse()</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'authors:'</span>, article.authors)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'date:'</span>, article.publish_date)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'text:'</span>, article.text)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'top image:'</span>, article.top_image)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'movies:'</span>, article.movies)</span><br><span class="line"></span><br><span class="line">article.nlp()</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'keywords:'</span>, article.keywords)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'summary:'</span>, article.summary)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里从 newspaper 库里面先导入了 Article 这个类，然后直接传入 url 即可，首先需要调用它的 download 方法，将网页爬取下来，否则直接进行解析会抛出错误的。</p>
                  <blockquote>
                    <p>但我总感觉这个设计挺不友好的，parse 方法不能判断下，如果没执行 download 就自动执行 download 方法吗？如果不 download 其他的不什么都干不了吗？</p>
                  </blockquote>
                  <p>好的，然后我们再执行 parse 方法进行网页的智能解析，这个功能就比较全了，能解析 authors、publish_date、text 等等，除了正文还能解析作者、发布时间等等。 另外这个库还提供了一些 NLP 的方法，比如获取关键词、获取文本摘要等等，在使用前需要先执行一下 nlp 方法。 最后运行结果如下：</p>
                  <figure class="highlight">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attribute">authors</span>: ['Cnn Wire']</span><br><span class="line"><span class="attribute">date</span>: 2013-12-30 00:00:00</span><br><span class="line"><span class="attribute">text</span>: By Leigh Ann Caldwell</span><br><span class="line"></span><br><span class="line">WASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014.</span><br><span class="line"></span><br><span class="line">Some 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibition in Illinois and transgender protections in California.</span><br><span class="line"></span><br><span class="line">Although many new laws are controversial, they made it through legislatures, public referendum or city councils and represent the shifting composition of American beliefs.</span><br><span class="line">...</span><br><span class="line">...</span><br><span class="line"><span class="attribute">Colorado</span>: Marijuana becomes legal in the state for buyers over 21 at a licensed retail dispensary.</span><br><span class="line"></span><br><span class="line">(Sourcing: much of this list was obtained from the National Conference of State Legislatures).</span><br><span class="line">top image: https://localtvkstu.files.wordpress.com/2012/04/national-news-e1486938949489.jpg?quality=85&amp;strip=all</span><br><span class="line"><span class="attribute">movies</span>: []</span><br><span class="line"><span class="attribute">keywords</span>: ['drones', 'national', 'guns', 'wage', 'law', 'pot', 'leave', 'family', 'states', 'state', 'latest', 'obamacare', 'minimum', 'laws']</span><br><span class="line"><span class="attribute">summary</span>: Oregon: Family leave in Oregon has been expanded to allow eligible employees two weeks of paid leave to handle the death of a family member.</span><br><span 
class="line"><span class="attribute">Arkansas</span>: The state becomes the latest state requiring voters show a picture ID at the voting booth.</span><br><span class="line">Minimum wage and former felon employmentWorkers in 13 states and four cities will see increases to the minimum wage.</span><br><span class="line">New Jersey residents voted to raise the state’s minimum wage by $1 to $8.25 per hour.</span><br><span class="line">California is also raising its minimum wage to $9 per hour, but workers must wait until July to see the addition.</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里省略了一些输出结果。 可以看到作者、日期、正文、关键词、标签、缩略图等信息都被打印出来了，还算是不错的。 但这个毕竟是官方的实例，肯定是好的，我们再测试一下刚才的例子，看看效果如何，网址还是：<a href="https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html" target="_blank" rel="noopener">https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html</a>，改写代码如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> newspaper import Article</span><br><span class="line"></span><br><span class="line">url = <span class="string">'https://tech.163.com/19/0909/08/EOKA3CFB00097U7S.html'</span></span><br><span class="line">article = Article(url, <span class="attribute">language</span>=<span class="string">'zh'</span>)</span><br><span class="line">article.download()</span><br><span class="line"><span class="comment"># print('html:', article.html)</span></span><br><span class="line"></span><br><span class="line">article.parse()</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'authors:'</span>, article.authors)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'title:'</span>, article.title)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'date:'</span>, article.publish_date)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'text:'</span>, article.text)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'top image:'</span>, article.top_image)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'movies:'</span>, article.movies)</span><br><span class="line"></span><br><span class="line">article.nlp()</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'keywords:'</span>, article.keywords)</span><br><span class="line"><span class="builtin-name">print</span>(<span class="string">'summary:'</span>, article.summary)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里我们将链接换成了新闻的链接，另外在 Article 初始化的时候还加了一个参数 language，其值为 zh，代表中文。 然后我们看下运行结果：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Building prefix dict from <span class="regexp">/usr/</span>local<span class="regexp">/lib/</span>python3<span class="number">.7</span><span class="regexp">/site-packages/</span>jieba/dict.txt ...</span><br><span class="line">Dumping model to file cache <span class="regexp">/var/</span>folders<span class="regexp">/1g/</span>l2xlw12x6rncs2p9kh5swpmw0000gn<span class="regexp">/T/</span>jieba.cache</span><br><span class="line">Loading model cost <span class="number">1.7178938388824463</span> seconds.</span><br><span class="line">Prefix dict has been built succesfully.</span><br><span class="line"><span class="string">authors:</span> []</span><br><span class="line"><span class="string">title:</span> 今年iPhone只有小改进？分析师：还有其他亮点</span><br><span class="line"><span class="string">date:</span> <span class="number">2019</span><span class="number">-09</span><span class="number">-09</span> <span class="number">08</span>:<span class="number">10</span>:<span class="number">26</span>+<span class="number">08</span>:<span class="number">00</span></span><br><span class="line"><span class="string">text:</span> （原标题：Apple Bets More Cameras Can Keep iPhone Humming）</span><br><span class="line"></span><br><span class="line">图示：苹果首席执行官蒂姆·库克(Tim Cook)在<span class="number">6</span>月份举行的苹果全球开发者大会上。</span><br><span class="line"></span><br><span class="line">网易科技讯 <span class="number">9</span>月<span class="number">9</span>日消息，据国外媒体报道，和过去的<span class="number">12</span>个年头一样，新款iPhone将成为苹果公司本周所举行年度宣传活动的主角。但人们的注意力正转向需要推动增长的其他苹果产品和服务。</span><br><span class="line">...</span><br><span class="line">...</span><br><span class="line">Strategy Analytics的尼尔·莫斯顿(Neil Mawston)表示，可穿戴设备和服务的结合将是苹果业务超越iPhone的关键。他说，上一家手机巨头诺基亚公司在试图进行类似业务转型时就陷入了困境之中。（辰辰）</span><br><span class="line"></span><br><span class="line">相关报道：</span><br><span class="line"></span><br><span class="line">iPhone <span class="number">11</span>背部苹果Logo改为居中：为反向无线充电</span><br><span 
class="line"></span><br><span class="line"><span class="number">2019</span>年新iPhone传言汇总，你觉得哪些能成真</span><br><span class="line">top <span class="string">image:</span> <span class="string">https:</span><span class="comment">//www.163.com/favicon.ico</span></span><br><span class="line"><span class="string">movies:</span> []</span><br><span class="line"><span class="string">keywords:</span> [<span class="string">'trust高级投资组合经理丹摩根dan'</span>, <span class="string">'iphone'</span>, <span class="string">'mawston表示可穿戴设备和服务的结合将是苹果业务超越iphone的关键他说上一家手机巨头诺基亚公司在试图进行类似业务转型时就陷入了困境之中辰辰相关报道iphone'</span>, <span class="string">'xs的销售疲软状况迫使苹果在1月份下调了业绩预期这是逾15年来的第一次据贸易公司susquehanna'</span>, <span class="string">'xs机型发布后那种令人失望的业绩重演iphone'</span>, <span class="string">'今年iphone只有小改进分析师还有其他亮点'</span>, <span class="string">'more'</span>, <span class="string">'xr和iphone'</span>, <span class="string">'morgan说他们现在没有任何真正深入的进展只是想继续让iphone这款业务继续转下去他乐观地认为今年发布的新款手机将有足够多的新功能为一个非常成熟的产品增加额外的功能让火车继续前进这种仅限于此的态度说明了苹果自2007年发布首款iphone以来所面临的挑战iphone销售占苹果公司总营收的一半以上这让苹果陷入了一个尴尬的境地既要维持核心产品的销量另一方面又需要减少对它的依赖瑞银ubs今年5月份对8000名智能手机用户进行了相关调查其发布的年度全球调查报告显示最近iphone在人脸识别技术等方面的进步并没有引起一些消费者的共鸣他们基本上都认为苹果产品没有过去几年那么独特或者惊艳品牌也没有过去几年那么有吸引力很多人使用老款手机的时间更长自己认为也没有必要升级到平均售价949美元的新款iphone苹果需要在明年销售足够多的iphone以避免像去年9月份iphone'</span>, <span class="string">'keep'</span>, <span class="string">'原标题apple'</span>]</span><br><span class="line"><span class="string">summary:</span> （原标题：Apple Bets More Cameras Can Keep iPhone Humming）图示：苹果首席执行官蒂姆·库克(Tim Cook)在<span class="number">6</span>月份举行的苹果全球开发者大会上。网易科技讯 <span class="number">9</span>月<span class="number">9</span>日消息，据国外媒体报道，和过去的<span class="number">12</span>个年头一样，新款iPhone将成为苹果公司本周所举行...亚公司在试图进行类似业务转型时就陷入了困境之中。（辰辰）相关报道：iPhone <span class="number">11</span>背部苹果Logo改为居中：为反向无线充电<span class="number">2019</span>年新iPhone传言汇总，你觉得哪些能成真</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>中间正文很长省略了一部分，可以看到运行时首先加载了一些中文的库包，比如 jieba 所依赖的词表等等。 解析结果中，日期的确是解析对了，因为这个日期格式的确比较规整，但这里还自动给我们加了东八区的时区，贴心了。作者没有提取出来，可能是没匹配到 <code>来源</code> 两个字吧，或者词库里面没有，标题、正文的提取还算比较正确，也或许这个案例的确是比较简单。 另外对于 NLP 部分，获取的关键词比较迷，长度有点太长了。summary 也有点冗余。 另外 Newspaper 还提供了一个较为强大的功能，就是 build 构建信息源。官方的介绍其功能就是构建一个新闻源，可以根据传入的 URL 来提取相关文章、分类、RSS 订阅信息等等。 我们用实例感受一下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import newspaper</span><br><span class="line"></span><br><span class="line">source = newspaper.build(<span class="string">'http://www.sina.com.cn/'</span>, <span class="attribute">language</span>=<span class="string">'zh'</span>)</span><br><span class="line"><span class="keyword">for</span> category <span class="keyword">in</span> source.category_urls():</span><br><span class="line">    <span class="builtin-name">print</span>(category)</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> article <span class="keyword">in</span> source.articles:</span><br><span class="line">    <span class="builtin-name">print</span>(article.url)</span><br><span class="line">    <span class="builtin-name">print</span>(article.title)</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> feed_url <span class="keyword">in</span> source.feed_urls():</span><br><span class="line">    <span class="builtin-name">print</span>(feed_url)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们传入了新浪的官网，调用了 build 方法，构建了一个 source，然后输出了相关的分类、文章、RSS 订阅等内容，运行结果如下：</p>
                  <figure class="highlight vim">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">http://cul.news.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://www.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/</span><br><span class="line">http://sc.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://jiangsu.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://gif.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">....</span><br><span class="line">http://<span class="keyword">tj</span>.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://jiaoyi.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http://cul.sina.<span class="keyword">com</span>.<span class="keyword">cn</span></span><br><span class="line">http<span class="variable">s:</span>//finance.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/roll/<span class="number">2019</span>-<span class="number">06</span>-<span class="number">12</span>/doc-ihvhiqay5022316.shtml </span><br><span class="line">经参头版：激发微观主体活力加速国企改革</span><br><span class="line">http://eladies.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/feel/xinli/<span class="number">2018</span>-<span class="number">01</span>-<span class="number">25</span>/<span class="number">0722</span>/doc-ifyqwiqk0463751.shtml </span><br><span class="line">我们别再联系了</span><br><span class="line">http://finance.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/roll/<span class="number">2018</span>-<span class="number">05</span>-<span class="number">13</span>/doc-ihamfahx2958233.shtml </span><br><span 
class="line">新违约时代到来！违约“常态化”下的市场出清与换血</span><br><span class="line">http://sports.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/basketball/<span class="number">2019</span>worldcup/<span class="number">2019</span>-<span class="number">09</span>-<span class="number">08</span>/doc-iicezzrq4390554.shtml </span><br><span class="line">罗健儿<span class="number">26</span>分韩国收首胜</span><br><span class="line">...</span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/outbound/pages/<span class="number">2019</span>-<span class="number">09</span>-<span class="number">05</span>/detail-iicezzrq3622449.shtml </span><br><span class="line">菲律宾海滨大道 夜晚让人迷离</span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/outbound/pages/<span class="number">2016</span>-<span class="number">08</span>-<span class="number">19</span>/detail-ifxvcnrv0334779.shtml  </span><br><span class="line">关岛 用双脚尽情享受阳光与海滩</span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/domestic/pages/<span class="number">2019</span>-<span class="number">09</span>-<span class="number">04</span>/detail-iicezzrq3325092.shtml </span><br><span class="line">秋行查干浩特草原</span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/outbound/pages/<span class="number">2019</span>-<span class="number">09</span>-<span class="number">03</span>/detail-iicezueu3050710.shtml </span><br><span class="line">白羊座的土豪之城迪拜</span><br><span class="line">http://travel.sina.<span class="keyword">com</span>.<span class="keyword">cn</span>/video/baidang/<span class="number">2019</span>-<span class="number">08</span>-<span class="number">29</span>/detail-ihytcitn2747327.shtml </span><br><span class="line">肯辛顿宫藏着维多利亚的秘密</span><br><span class="line">http://<span class="keyword">cd</span>.auto.sina.<span 
class="keyword">com</span>.<span class="keyword">cn</span>/bdcs/<span class="number">2017</span>-<span class="number">08</span>-<span class="number">15</span>/detail-ifyixias1051586.shtml</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到它输出了非常多的类别链接，另外还有很多文章列表，由于没有 RSS 订阅内容，这里没有显示。 下面把站点换成我的博客：<a href="https://cuiqingcai.com" target="_blank" rel="noopener">https://cuiqingcai.com</a>，博客截图如下： <img src="https://qiniu.cuiqingcai.com/2019-09-09-134550.png" alt="博客截图"> 看看运行结果：</p>
                  <figure class="highlight dts">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">https:</span><span class="comment">//cuiqingcai.com</span></span><br><span class="line"><span class="symbol">https:</span><span class="comment">//cuiqingcai.com</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>似乎不太行啊，一篇文章都没有，RSS 也没有，可见其功能还有待优化。 Newspaper 的基本用法介绍到这里，更加详细的用法可以参考官方文档：<a href="https://newspaper.readthedocs.io" target="_blank" rel="noopener">https://newspaper.readthedocs.io</a>。个人感觉其中的智能解析可以用用，不过据我的个人经验，感觉还是有很多解析不对或者解析不全的情况。 以上便是 Readability 和 Newspaper 的介绍。</p>
                  <h2 id="其他方案"><a href="#其他方案" class="headerlink" title="其他方案"></a>其他方案</h2>
                  <p>另外除了这两个库其实还有一些比较优秀的算法，由于我们处理的大多为中文文档，所以一些在中文上面的研究是比较有效的，在这里列几个值得借鉴的中文论文供大家参考：</p>
                  <ul>
                    <li>洪鸿辉等，基于文本及符号密度的网页正文提取方法</li>
                    <li>梁东等，基于支持向量机的网页正文内容提取方法</li>
                    <li>王卫红等，基于可视块的多记录型复杂网页信息提取算法</li>
                  </ul>
                  <p>今天还看到一位大佬「青南」根据上面第一篇论文所实现的 GeneralNewsExtractor，GitHub 地址为：<a href="https://github.com/kingname/GeneralNewsExtractor" target="_blank" rel="noopener">https://github.com/kingname/GeneralNewsExtractor</a>，经测试准确率还不错，比 Readability 和 Newspaper 的解析效果要好。我也跟作者进行了交流，后续可能还会基于其他的 Feature 或依赖于视觉化的方法进行优化，大家可以关注下，谢谢！</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-12 09:37:24" itemprop="dateCreated datePublished" datetime="2019-09-12T09:37:24+08:00">2019-09-12</time>
                </span>
                <span id="/7436.html" class="post-meta-item leancloud_visitors" data-flag-title="爬虫智能解析库 Readability 和 Newspaper 的用法" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>15k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>14 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7440.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7440.html" class="post-title-link" itemprop="url">谈谈 Zao 这个软件</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>这两天想必大家应该被一个软件刷屏了，它的名字叫做 Zao，中文音译就叫“造”。它为什么这么火呢？是因为我们可以上传自己的一张照片，它就能把一些热门视频里男主或女主的脸替换成我们的脸，也就是视频换脸。 比如有人尝试了把尼古拉斯赵四的脸换到美国队长的脸上，美队的气质简直就是被垄断了，大家可以扫码看看： <img src="https://qiniu.cuiqingcai.com/2019-08-31-151859.png" alt="美队变赵四"> 视频换脸技术大家应该早有耳闻，但这个软件有点意思，它抓住了几个点使得它一炮而红。 第一是这个软件的效果确实不错，我拿自己也做了实验，发现确实它渲染的一些结果几乎毫无违和感，毕竟这个软件核心拼的就是技术。 第二这个软件贴近于日常生活，我们可以把自己的照片上传，让我们真正成为视频里的主角。另外视频选材很有讲究，都是一些剪辑过的明星精彩镜头，这样我们生成的视频镜头会让我们有变成明星的感觉，非常有代入感。</p>
                  <h2 id="技术实现"><a href="#技术实现" class="headerlink" title="技术实现"></a>技术实现</h2>
                  <p>作为一名程序员，当然最关心的可能就是它的技术实现了，毋庸置疑它肯定是利用了深度学习的一些技术。我看了一些文章和调研，大体了解了一下，下面稍微分析一下里面用到的一些技术。 整体而言呢，这个过程分为三步，他们分别是：</p>
                  <ul>
                    <li>人脸定位</li>
                    <li>人脸生成</li>
                    <li>图像融合</li>
                  </ul>
                  <h3 id="人脸定位"><a href="#人脸定位" class="headerlink" title="人脸定位"></a>人脸定位</h3>
                  <p>现在深度学习对于人脸识别和定位的研究技术已经非常成熟和精准了，其核心就是使用了卷积神经网络，即 CNN，不同的模型架构对于识别的准确率有不同的表现。 对于人脸的定位，一般是使用脸部的关键点定位的，这些点叫做 Landmarks。在一张人脸图像上，每张脸的轮廓和五官的位置都会被打上点，比如整个脸部的轮廓用一些点描出来，鼻子、眼睛、唇形同样用一些点描出来。 <img src="https://qiniu.cuiqingcai.com/2019-08-31-152358.jpg" alt="Facial Feature Detection"> 一般来说一张脸会用 68 个点来标记出来，识别模型接收一张人脸图像，输出这 68 个点的坐标，这样我们就可以实现人脸定位了。 现在现成的模型也很多了，比如 dlib，opencv 等开源工具包可以直接拿来使用了，如果要更精准的话可以使用更复杂的卷积神经网络模型来实现，大家可以了解下相关论文。</p>
                  <h3 id="人脸生成"><a href="#人脸生成" class="headerlink" title="人脸生成"></a>人脸生成</h3>
                  <p>有了标记点以后，这个软件就可以把我们的人脸提取出来了，但是这有个问题，我们上传的是一张静态图片，总不能直接生硬地替换进去吧，比如我们上传的是一张正脸照片，那视频里的一些侧脸画面直接贴上那不就没法看了吗？ 这时候就要用到另外一个核心技术叫做人脸生成技术，有了它我们就可以对人脸进行生成了，比如根据一张正脸图生成一张侧脸图。目前人脸生成技术主要有两种，有 GAN（生成对抗网络）和 VAE（变分自编码器），下面简单介绍一下它们的原理。 对于 GAN 来说，它叫做生成对抗网络，为什么叫对抗网络呢？是因为模型在训练的过程中一直有两个东西在做对抗，这俩东西分别叫 Generator（生成器）和 Discriminator（判别器）。前者主要负责生成一张人脸，越像越牛逼。后者主要负责判断分辨前者生成的人脸是不是真的，判定越准越牛逼。二者在这个过程中为了变得越来越牛逼，前者就会尽力去生成更像的人脸来欺骗后者，后者也会尽力去判别生成的人脸是不是真的来打击前者。这样二者在不断地训练和对抗过程中，前者生成的结果就会越来越好了。 对于 VAE 呢，它是通过一些无监督学习的方式将人脸信息进行压缩，由编码器把它表示成一个短向量，这些向量里就包含了人脸的基本信息，比如肤色、唇形等信息，这样整个模型就可以学习到人脸的共性。然后，解码器将向量解码，将其转换为某一特定的人脸。这样就等于经过一层中间向量完成了从一张人脸到另一张人脸的转换。</p>
                  <h3 id="图像融合"><a href="#图像融合" class="headerlink" title="图像融合"></a>图像融合</h3>
                  <p>最后的阶段就是图像融合了，也就是把生成的新的人脸和原来图像的背景融合，使之不会产生违和感。 在这个软件中，视频是由一帧一帧组成的，那么在转换的时候也需要一帧一帧处理，最后处理完成后再合成整个视频。 以上也就是我所了解到的变脸的一些方法。</p>
                  <h2 id="安全性"><a href="#安全性" class="headerlink" title="安全性"></a>安全性</h2>
                  <p>有人说，这个技术不是什么好技术。万一有人拿着我们的照片一变脸，就能够把我们任意的表情和头部动作模拟出来，拿着去做认证，比如刷脸支付什么的咋办，那我们的钱不就被盗刷了吗？ 对于这个问题，支付宝官方也做了回应，支付宝称刷脸支付实际上会通过软硬件结合的方式进行检测，其会判断被刷物体是否是照片、视频或者软件模拟的方式生成的，可以有效避免身份冒用情况。其中有一个核心技术就是通过 3D 结构光摄像头来进行信息采集和识别，如果被拍摄物体是平面的，也就是说如果是照片或者视频，是无法通过检测的。 <img src="https://qiniu.cuiqingcai.com/2019-08-31-152611.png" alt="支付宝回应"> 这时候我自然而然想到，既然用的是 3D 结构光摄像头，那么如果用了 3D 打印技术把一个人的肖像打印出来，或者用一个非常逼真的蜡像来进行刷脸识别，能不能通过呢？我看了一些报道，发现不少案例的确通过了刷脸测试，比如解开了 iPhone 面部识别锁等等。但要通过 3D 打印技术来模拟一个人的肖像成本还是蛮高的，所以基本上也不太会有人来搞这些。 如果对此还心有余悸的话，支付宝还回应称，即便是真的被盗刷了，支付宝也会通过保险公司进行全额赔付。 所以基本上是不用担心其安全性的，尤其是 Zao 这个软件的出现是没有对刷脸支付的风险造成大的影响的，其就是增加了一个活体视频模拟的实现，对刷脸支付的安全性没有出现大的突破性威胁。</p>
                  <h2 id="隐私性"><a href="#隐私性" class="headerlink" title="隐私性"></a>隐私性</h2>
                  <p>这个就要好好说一下了，这个软件的出现同时引起了另一个轩然大波，那就是其中的隐私条款。 其隐私条款有一条是这样的：</p>
                  <blockquote>
                    <p>用户上传发布内容后，意味着同意授予 ZAO 及其关联公司以及 ZAO 用户在“全球范围内完全免费、不可撤销、永久、可转授权和可再许可的权利”，“包括但不限于可以对用户内容进行全部或部分的修改与编辑（如将短视频中的人脸或者声音换成另一个人的人脸或者声音等）以及对修改前后的用户内容进行信息网络传播以及《著作权法》规定的由著作权人享有的全部著作财产权利及邻接权利”。</p>
                  </blockquote>
                  <p>这条款没人说还真没注意到，因为一般咱用一个软件，一般不会去仔细看它的条款，那么密密麻麻的一坨，有几个人会去仔细看呢？但要不同意，这个软件还没法用，所以用过这个软件的人，这个条款一定是已经同意了。 这条条款其实是很过分的，同意授予 Zao 及其关联公司以及 Zao 用户在“全球范围内完全免费、不可撤销、永久、可转授权和可再许可的权利”。注意这里有几个字，完全免费、不可撤销、永久、可转授权、可再许可，这几个词就代表我们已经把我们的肖像权永久授予了 Zao 及其关联公司了，而且不能撤销，账号注销了也不能撤销，也就是以后它们可以有权利永久滥用我们的肖像。更可怕的是，其中还有一个词叫可转授权，那也就是说，Zao 可以对我们的肖像权进行转授权，你懂的，给点钱，啥办不到呢？这就更无法控制了，这可能就意味着，世界上任何一个人可能都能获得我们的肖像权。 所以说，如果你还没用的话，一定要谨慎谨慎再谨慎！ 哎，反正我已经同意了，貌似我现在也没什么办法了。</p>
                  <h2 id="社会影响"><a href="#社会影响" class="headerlink" title="社会影响"></a>社会影响</h2>
                  <p>这个软件的出现，更深一点想，其实它所隐含的影响还是蛮大的。 有了这个变脸技术，如果有人获得了我们在条款里面所“捐出”的肖像权，拿着我们的照片去生产那种你懂的影片，把视频里面的男主或者女主换成我们的人脸，然后到处传播，或者以此作为敲诈勒索的工具。即便我们有理，那也说不清了，首先这个条款已经说了它们可以有权利随意使用我们的肖像，所以告侵犯肖像权已经行不通了，而且即使我们有证据证明这是假的，但这种视频的传播也一定会带来非常大的影响。 按照现在大众们的观念，比如说一张图，我们如果不信的话可以说它是 P 的，但如果换做是视频的话，很多人可能就会相信了，因为很多人不知道视频中的肖像也可以伪造得这么真了，毕竟很多人并不知道这种技术。因此，有了这种技术的出现，以后视频类的证据，可能也不可信了。因此这个软件的出现，可以说从另一个侧面昭示，以后视频也不能作为犯案的证据和验证人的真伪的依据了。 所以以后可能是这样子的：</p>
                  <ul>
                    <li>坐在电脑面前的网络女主播，即便不开美颜和滤镜，你所看到的她也不是真的她了。</li>
                    <li>你要给人打个钱，说开个视频吧，我看看是不是真的你，即便看到的是他，你也不能信了。</li>
                    <li>有人要 Qiao Zha 你，把你的人脸换成 Zuo An 分子的脸，你到哪里说理去？</li>
                    <li>某一天，你作为男女主角，出现在了 P 站和 91….</li>
                  </ul>
                  <p>我一开始想的还没这么深，边想边写，写到这，我自己都开始后怕了… 怎么甚至感觉，以后的社会可能会乱套了呢？这可能就是 AI 发展的一个隐患吧。 所以写到最后，虽然这个软件很有意思，但还是劝大家还没有用的就不要用了吧，真的很可怕。同时我也不知道这个软件这样的条款和做法会不会有什么问题，但还是希望能引起有关部门的注意。 以后，也希望大家在使用软件的时候更加谨慎和小心，有条款就稍微看一看，尤其是对于这种和用户隐私相关的软件，要更加心存戒备。</p>
                  <h2 id="参考文章"><a href="#参考文章" class="headerlink" title="参考文章"></a>参考文章</h2>
                  <p>本文参考来源：</p>
                  <ul>
                    <li>机器之心：刷屏的 ZAO 换脸 APP 你玩了吗？</li>
                    <li>支付宝推出的刷脸支付是基于“活体检测”技术做支撑</li>
                  </ul>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-09-04 09:40:27" itemprop="dateCreated datePublished" datetime="2019-09-04T09:40:27+08:00">2019-09-04</time>
                </span>
                <span id="/7440.html" class="post-meta-item leancloud_visitors" data-flag-title="谈谈 Zao 这个软件" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>3.5k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>3 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7121.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> 技术杂谈 <i class="label-arrow"></i>
                  </a>
                  <a href="/7121.html" class="post-title-link" itemprop="url">如何学好 MongoDB</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <h1 id="开发者如何学好-MongoDB"><a href="#开发者如何学好-MongoDB" class="headerlink" title="开发者如何学好 MongoDB"></a>开发者如何学好 MongoDB</h1>
                  <p>作为一名研发，数据库是或多或少都会接触到的技术。MongoDB 是热门的 NoSQL 之一，我们怎样才能学好 MongoDB 呢？ 本篇文章，我们将从以下几方面讨论这个话题：</p>
                  <ol>
                    <li><strong>MongoDB 是什么</strong></li>
                    <li><strong>我如何确定我需要学习 MongoDB</strong></li>
                    <li><strong>开发者应该掌握 MongoDB 的哪些知识</strong></li>
                    <li><strong>学习的选择和困境</strong></li>
                  </ol>
                  <p>我们先来了解一下，MongoDB 为何物。 NoSQL 泛指非关系型数据库，该词是关系型数据库（即 SQL）的相对称呼。MongoDB 是非关系型数据库（NoSQL）中较为人熟知的一种。它拥有很多优秀特性，例如高性能、高可用、支持丰富的查询语句、无需预定义数据模型和水平可伸缩等，适合存储结构化、半结构化的文档和特定格式的文档，这些特性使它受到众多开发者的青睐。 <img src="https://ws1.sinaimg.cn/large/007VLJ1tly1g5hu72cwyhj307u020aa4.jpg" alt=""> 我们通过几个例子来看看 MySQL 与 MongoDB 的差异。 与 MySQL 数据库不同的是，MongoDB 不需要预先定义表和字段，这正是它灵活性的体现。MongoDB 可以拥有多个数据库，每个数据库可以拥有多个集合，每个集合可以存储多份文档，这种关系与 SQL 数据库中的“数据库、表、数据”相当。下图描述了 MongoDB 中数据库、集合和文档的关系： <img src="https://images.gitbook.cn/07193ca0-b418-11e9-8eb9-87f7ea48b37f" alt=""> 数据库 <code>fotoo</code> 中有两个集合，它们分别是 <code>player</code> 和 <code>books</code>。每个集合中都包含了许多文档，例如集合 <code>books</code> 中关于书籍《红楼梦》的文档，集合 <code>player</code> 中关于球员 <code>James</code> 的文档。 在查询方面，一个简单的 MySQL 查询语句为 <code>SELECT * FROM tablename</code>，对应的 MongoDB 查询语句为 <code>db.tablename.find()</code>。在面对多步骤的查询条件时，MongoDB 更游刃有余。例如： “统计数据库 <code>artic</code> 中 <code>score</code> 大于 <code>70</code> 且小于 <code>90</code> 的文档数量” 这样的需求，用 MongoDB 的聚合操作就可以轻松完成，对应示例如下：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&gt; db.artic.aggregate([</span><br><span class="line">... &#123;<span class="string">$match:</span> &#123;<span class="string">score:</span> &#123;<span class="string">$gt:</span> <span class="number">70</span>, <span class="string">$lt:</span> <span class="number">90</span>&#125;&#125;&#125;,</span><br><span class="line">... &#123;<span class="string">$group:</span> &#123;<span class="string">_id:</span> <span class="literal">null</span>, <span class="string">number:</span> &#123;<span class="string">$sum:</span> <span class="number">1</span>&#125;&#125;&#125;</span><br><span class="line">... ])</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这个例子或许简单了些，在 MySQL 中我们可以用 <code>count</code> 和 <code>where</code> 完成，但如果复杂度再提高四五个等级呢？例如在此基础上增加对某个字段的运算、替换、排序、分组计数、增删字段，用 MySQL 来实现就会很头疼，而 MongoDB 的聚合可以让我们轻松地完成这类复杂需求。</p>
                  <h2 id="我如何确定我需要学习-MongoDB"><a href="#我如何确定我需要学习-MongoDB" class="headerlink" title="我如何确定我需要学习 MongoDB"></a>我如何确定我需要学习 MongoDB</h2>
                  <p>MongoDB 是近些年涌现的几十种 NoSQL 中第一梯队的成员，另外一个为人熟知的是 Redis。你可能会有“我如何确定我需要学习 MongoDB 呢？”这样的疑问，面对这个问题，我们可以从 MongoDB 的特点和应用场景着手。</p>
                  <ul>
                    <li>MongoDB 适合存储结构确定或不确定的文档。例如爬虫爬取的信息常缺失字段的情况或字段参差不齐的情况；</li>
                    <li>对数据库可用性要求较高的情况。MySQL 这类数据库要做到负载均衡、自动容灾和数据同步需要借助外部工具，而 MongoDB 的复制集可以让我们轻松完成这一系列的工作。相对于借助第三方工具来说，复制集的稳定性更高。</li>
                    <li>分库分表是 WEB 开发中常用到的数据库优化手段，MySQL 的分库分表要考虑的问题非常多，例如字段冗余、数据组装跨节点分页、排序和数据迁移等，而 MongoDB 的分片可以让我们轻松完成“分库分表”的工作。MongoDB 的分片机制使我们不必将心思放在由“分库分表”带来的问题，而是专注于具体需求。</li>
                    <li>同样的，MySQL 的权限控制、定义数据模型、数据库备份和恢复等功能在 MongoDB 上也有。</li>
                    <li>MongoDB 中支持地理位置的存储和查询，这意味着 MongoDB 可以用于共享单车、共享雨伞、汽车定位等业务中。</li>
                  </ul>
                  <p>我们常用的关系型数据库无法满足 WEB2.0 时代的需求，在实际应用中暴露了很多难以克服的问题。NoSQL 的产生就是为了解决例如<strong>海量数据的存储</strong>、<strong>弹性可伸缩</strong>和<strong>灵活性</strong>等方面的挑战，所以作为一名合格的开发者，应该抽空学习 SQL 以外的数据库知识。</p>
                  <h2 id="开发者应该掌握-MongoDB-的哪些知识"><a href="#开发者应该掌握-MongoDB-的哪些知识" class="headerlink" title="开发者应该掌握 MongoDB 的哪些知识"></a>开发者应该掌握 MongoDB 的哪些知识</h2>
                  <p>学习前，我们需要明白自身定位：专业 DBA 或者日常开发使用。MongoDB 有完善的培训体系和对应的认证考试，对于希望成为专业 DBA 的朋友我建议到 MongoDB 官方网站了解。而对于仅需要<strong>满足日常开发需求</strong>的朋友，我建议学习的内容如下：</p>
                  <ul>
                    <li>MongoDB 在各个平台的安装方法</li>
                    <li>MongoDB 数据库和集合的基本操作</li>
                    <li>MongoDB 文档 CRUD 操作，包括能够丰富 CRUD 的投影和修饰符等</li>
                    <li>MongoDB 流式聚合操作，这能够在数据库层面轻松完成复杂数据的处理，而不是用编程语言来处理</li>
                    <li>MongoDB 的数据模型，虽然它可以存储不规则的文档，但有些情况下定义数据模型可以提高查询效率</li>
                  </ul>
                  <p>当然，除了这些基本操作之外我们还可以<strong>学习更多的知识提高个人竞争力</strong>，这些知识是：</p>
                  <ul>
                    <li>MongoDB 执行计划和索引，执行计划可以让我们清楚的了解到查询语句的效率，而索引则是优化查询效率的常用手段</li>
                    <li>MongoDB 的复制集，这是提高 MongoDB 可用性，保证数据服务不停机的最佳手段</li>
                    <li>MongoDB 的分片，分片能够在数据量变得庞大之后保证效率</li>
                    <li>MongoDB 的事务，如果你将 MongoDB 用于 WEB 网站，那么事务是你必须学习的知识</li>
                    <li>MongoDB 数据库备份和还原，有了复制集后，备份就显得不是那么重要了，但并不是没有这个需求。而且 MongoDB 的备份可以精细到文档，这就非常有意义了。</li>
                  </ul>
                  <h2 id="学习的选择和困境"><a href="#学习的选择和困境" class="headerlink" title="学习的选择和困境"></a>学习的选择和困境</h2>
                  <p>有一定工作经验的开发者，大多数情况下都会选择自学。有些在网上搜索对应的文章，有些则直接翻阅官方文档。我推荐的方式是翻阅官方文档，在遇到难以理解的观点时通过搜索引擎查找网友分享的文章。 自学的优点很多，缺点也很明显。例如：</p>
                  <ul>
                    <li><strong>断断续续的学习，难以保持专注导致知识吸收不好</strong></li>
                    <li><strong>耗费时间很长，虽然知道应该学习哪些方面的知识，但文档并不是按你所想而规划的，所以翻阅文档要费很多功夫</strong></li>
                    <li><strong>知识不成体系，东看看西看看，没有归纳容易忘记</strong></li>
                    <li><strong>学习就需要记笔记，这又是一件很费时间的事情</strong></li>
                    <li><strong>官方文档有些观点难以理解，卡在半路很难受</strong></li>
                    <li><strong>零星学了一两个月，也不确定学会了没有，内心毫无把握</strong></li>
                  </ul>
                  <p>如果不自学，就得找一些成体系的课程来帮助自己快速进步，少走弯路。知识付费时代，在条件允许的情况下适当地投入也是很好的选择。但面对动辄几百块的视频课程，不少开发者还是感觉略有压力，毕竟我们搬砖的经济压力也非常大。培训班就更不用说了，很少有专业教授单个数据库知识的，而且费用比视频课程更贵。 考虑到这些问题，这里推荐<strong>韦世东</strong>的 GitChat 文章 <strong>《超高性价比的 MongoDB 零基础快速入门实战教程》</strong>，这也是一个收费教程，但它售价不到 10 块钱。文章作者韦世东是：<strong>图灵签约作者、电子工业出版社约稿作者，华为云认证云享专家、掘金社区优秀作者、GitChat 认证作者，开源项目 aiowebsocket 作者</strong>。所以在文章质量上，大家可以放心。 这篇文章的内容几乎囊括了上面我们提到的所有知识点，看完这篇仅 5 万词的文章，你将收获：</p>
                  <ul>
                    <li><strong>文档的 CRUD 操作和 Cursor 对象</strong></li>
                    <li><strong>掌握流式聚合操作，轻松面对任何数据处理需求</strong></li>
                    <li><strong>了解 MongoDB 的查询效率和优化</strong></li>
                    <li><strong>如何提高 MongoDB 的可用性</strong></li>
                    <li><strong>如何应对数据服务故障</strong></li>
                    <li><strong>理解 MongoDB 的访问控制</strong></li>
                    <li><strong>学会用数据模型降低数据冗余，提高效率</strong></li>
                    <li><strong>掌握 mongodump 数据备份与还原方法</strong></li>
                  </ul>
                  <p>这样就可以<strong>胜任日常开发中对数据库操作能力的要求</strong>了。这篇文章适合对 MongoDB 感兴趣的零基础开发者或者有一定基础，想要继续巩固和加深学习的开发者。文章篇幅很长，内容详尽，不乏优质配图，例如描述复制集节点关系的图： <img src="https://ws1.sinaimg.cn/large/007VLJ1tly1g5itduq6uqj30z60jw75s.jpg" alt="复制集节点关系图"> 描述主节点掉线，重新选举主节点的图： <img src="https://ws1.sinaimg.cn/large/007VLJ1tly1g5itsqu9oqj31880mu0vm.jpg" alt="主节点掉线后重新选举主节点的示意图"> 如果你觉得有学习 MongoDB 的需要，且这篇文章规划的内容是你想要的内容，那么请<strong>长按下方图片识别二维码</strong>，前往订阅文章吧！ <img src="http://ww1.sinaimg.cn/large/006tNc79gy1g5z3r992yqj30ku102k12.jpg" alt="订阅文章的二维码"></p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/韦世东学算法和反爬虫" class="author" itemprop="url" rel="index">韦世东学算法和反爬虫</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-14 20:22:07" itemprop="dateCreated datePublished" datetime="2019-08-14T20:22:07+08:00">2019-08-14</time>
                </span>
                <span id="/7121.html" class="post-meta-item leancloud_visitors" data-flag-title="如何学好 MongoDB" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>3.2k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>3 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7080.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7080.html" class="post-title-link" itemprop="url">利用 Python Faker 包来制作假数据</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>在做程序开发的时候，我们经常会用到一些测试数据，相信大多数同学是这么来造测试数据的：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">test1</span><br><span class="line">test01</span><br><span class="line">test02</span><br><span class="line">测试<span class="number">1</span></span><br><span class="line">测试<span class="number">2</span></span><br><span class="line">测试数据<span class="number">1</span></span><br><span class="line">这是一段测试文本</span><br><span class="line">这是一段很长很长很长的测试文本...</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>中枪的请举手。 不仅要自己手动敲这些测试数据，还敲得这么假。那有啥办法呢？难不成有什么东西能自动给我造点以假乱真的数据啊？你别说，还真有！ 在 Python 中有个神库，叫做 Faker，它可以自动帮我们来生成各种各样的看起来很真的“假”数据，让我们来看看吧！</p>
                  <h2 id="安装"><a href="#安装" class="headerlink" title="安装"></a>安装</h2>
                  <p>首先让我们来看看这个库的安装方法，实际上装起来非常简单，使用 pip 安装即可，Python3 版本的安装命令如下：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">pip3 <span class="keyword">install</span> faker</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>安装好了之后，我们使用最简单的例子来生成几个假数据试试：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">from faker import Faker</span><br><span class="line"></span><br><span class="line">faker = Faker()</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'name:'</span>, faker.name()</span></span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'address:'</span>, faker.address()</span></span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'text:'</span>, faker.text()</span></span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>首先我们从 faker 这个包里面导入一个 Faker 类，然后将其实例化为 faker 对象，依次调用它的 name、address、text 方法，看下运行效果：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="type">name</span>: Nicholas Wilson</span><br><span class="line">address: <span class="number">70561</span> Simmons Road Apt. <span class="number">893</span></span><br><span class="line">Lake Raymondville, HI <span class="number">35240</span></span><br><span class="line"><span class="type">text</span>: <span class="keyword">Both</span> <span class="keyword">begin</span> bring federal space.</span><br><span class="line">Official <span class="keyword">start</span> idea specific. Able under young fire.</span><br><span class="line">Who <span class="keyword">show</span> <span class="type">line</span> traditional easy people. <span class="keyword">Until</span> economic lead event <span class="keyword">case</span>. Technology college his director style.</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>看到这里给我们生成了看起来很真的英文姓名、地址、长文本。 但我们是中国人，我们肯定想要生成中文的吧，不用担心，这个库对非常多的语言都有支持，当然也包括中文了，具体的支持的语言列表可以见：<a href="https://faker.readthedocs.io/en/master/locales.html" target="_blank" rel="noopener">https://faker.readthedocs.io/en/master/locales.html</a>。 这里几个比较常见的语言代号列一下：</p>
                  <ul>
                    <li>简体中文：zh_CN</li>
                    <li>繁体中文：zh_TW</li>
                    <li>美国英文：en_US</li>
                    <li>英国英文：en_GB</li>
                    <li>德文：de_DE</li>
                    <li>日文：ja_JP</li>
                    <li>韩文：ko_KR</li>
                    <li>法文：fr_FR</li>
                  </ul>
                  <p>那么如果要生成中文，只需要在 Faker 类的第一个参数传入对应的语言代号即可，例如简体中文就传入 zh_CN，所以上面的代码改写如下：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">from faker import Faker</span><br><span class="line"></span><br><span class="line">faker = Faker(<span class="string">'zh_CN'</span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'name:'</span>, faker.name()</span></span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'address:'</span>, faker.address()</span></span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'text:'</span>, faker.text()</span></span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果如下：</p>
                  <figure class="highlight erlang">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">name: 何琳</span><br><span class="line">address: 宁夏回族自治区六盘水县南溪北镇街f座 <span class="number">912311</span></span><br><span class="line">text: 经营软件积分开始次数专业.美国留言一种管理人民解决两个.支持只有地方一切.</span><br><span class="line">文化目前东西的是不过所以.系统觉得这种为什一下他们.时候以及这样继续是一状态威望.</span><br><span class="line">网站密码情况.问题一点那个还是.其实过程详细.</span><br><span class="line">中国历史环境电话规定.经验上海控制不要生活.朋友运行项目我们.</span><br><span class="line">以后今天那些使用免费国家加入但是.内容简介空间次数最大一个.日期通过得到日本北京.</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到一段中文的姓名、地址、长文本便生成了。看起来地址是由省份、地级市、县级市、街道随机组合而成的，文本也是由一些随机的词组合而成的，但其实这样已经比文章一开头列的测试数据强太多了。 上面的代码每次运行得到的结果都是不同的，因为生成的结果都是随机组合而成的。</p>
                  <h2 id="Provider"><a href="#Provider" class="headerlink" title="Provider"></a>Provider</h2>
                  <p>接下来让我们详细看下 faker 都可以生成什么类型的数据，具体的可用 API 可以看 <a href="https://faker.readthedocs.io/en/master/locales/zh_CN.html" target="_blank" rel="noopener">https://faker.readthedocs.io/en/master/locales/zh_CN.html</a>，这里面列出来了可用的所有方法。 但打开之后可以发现，这里面多了一个 Provider 对象，那么这个 Provider 是怎么一回事呢？ 实际上这个 faker 库在设计上，为了解耦，将 Provider 对象做成了 Faker 对象的“插件”。Faker 可以添加一个个 Provider 对象，Provider 对象为 Faker 对象提供了生成某项数据的核心实现。就相当于 Faker 对象是一个生成器，它的生成功能依赖于什么呢？依赖于 Provider，是 Provider 提供给了 Faker 对象生成某项数据的能力。 正是因为 Faker 对象内置了一些 Provider 对象，Faker 对象才可以生成刚才所要求的姓名、地址和文本。 那么这时候我们肯定就很好奇了，既然 Faker 对象有生成数据的能力，那么它一定内置了一些默认的 Provider 对象，下面我们来打印看一下：</p>
                  <figure class="highlight isbl">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="variable">from</span> <span class="variable">faker</span> <span class="variable">import</span> <span class="variable">Faker</span></span><br><span class="line"></span><br><span class="line"><span class="variable">faker</span> = <span class="function"><span class="title">Faker</span>(<span class="string">'zh_CN'</span>)</span></span><br><span class="line"><span class="function"><span class="title">print</span>(<span class="variable">faker.providers</span>)</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果如下：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">[&lt;faker<span class="selector-class">.providers</span><span class="selector-class">.user_agent</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249de48&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.ssn</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249dc18&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.python</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249dd68&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.profile</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249dcc0&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.phone_number</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249dc88&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.person</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249de80&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.misc</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249df60&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.lorem</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> 
<span class="selector-tag">object</span> at <span class="number">0</span>x10249dc50&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.job</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249de10&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.isbn</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249c6d8&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.internet</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x10249c828&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.geo</span><span class="selector-class">.en_US</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x102484748&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.file</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x102484828&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.date_time</span><span class="selector-class">.en_US</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x1023789e8&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.currency</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x102484780&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.credit_card</span><span 
class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x1024845f8&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.company</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x102499ef0&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.color</span><span class="selector-class">.en_US</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x1023532e8&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.barcode</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x101cb6d30&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.bank</span><span class="selector-class">.en_GB</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x102378f98&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.automotive</span><span class="selector-class">.en_US</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x1017a5c50&gt;, &lt;faker<span class="selector-class">.providers</span><span class="selector-class">.address</span><span class="selector-class">.zh_CN</span><span class="selector-class">.Provider</span> <span class="selector-tag">object</span> at <span class="number">0</span>x101787c18&gt;]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>还真不少，通过名字可以看到有 user_agent、phone_number、isbn、credit_card 等 Provider，其中具有语言差异化的 Provider 还单独区分了语言，比如 phone_number 代表电话号码，而电话号码的格式在不同语言（地区）下各不相同，所以这里就又分了一层 zh_CN，作了语言的区分。 这样一来，通用的 Provider 就直接处在某个 Provider 类别的模块中，具有语言差异的 Provider 就又根据不同的语言进一步划分了模块，设计上非常科学，易扩展又不冗余。 知道了 Faker 具有这么多 Provider 之后，我们来看看刚才调用的 name、address 等方法又和 Provider 有什么关系呢？ 我们将 name、address、text 等方法打印一下看看：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">from faker import Faker</span><br><span class="line"></span><br><span class="line">faker = Faker(<span class="string">'zh_CN'</span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'name:'</span>, faker.name)</span></span></span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'address:'</span>, faker.address)</span></span></span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'text:'</span>, faker.text)</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>注意这里没有调用，而是直接打印了这三个方法，这样可以直接输出方法的对象形式的描述，结果如下：</p>
                  <figure class="highlight oxygene">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">name: &lt;bound <span class="function"><span class="keyword">method</span> <span class="title">Provider</span>.<span class="title">name</span> <span class="title">of</span> &lt;<span class="title">faker</span>.<span class="title">providers</span>.<span class="title">person</span>.<span class="title">zh_CN</span>.<span class="title">Provider</span> <span class="title">object</span> <span class="title">at</span> 0<span class="title">x10f6dea58</span>&gt;&gt;</span></span><br><span class="line"><span class="function"><span class="title">address</span>:</span> &lt;bound <span class="function"><span class="keyword">method</span> <span class="title">Provider</span>.<span class="title">address</span> <span class="title">of</span> &lt;<span class="title">faker</span>.<span class="title">providers</span>.<span class="title">address</span>.<span class="title">zh_CN</span>.<span class="title">Provider</span> <span class="title">object</span> <span class="title">at</span> 0<span class="title">x10e9e6cf8</span>&gt;&gt;</span></span><br><span class="line"><span class="function"><span class="title">text</span>:</span> &lt;bound <span class="function"><span class="keyword">method</span> <span class="title">Provider</span>.<span class="title">text</span> <span class="title">of</span> &lt;<span class="title">faker</span>.<span class="title">providers</span>.<span class="title">lorem</span>.<span class="title">zh_CN</span>.<span class="title">Provider</span> <span class="title">object</span> <span class="title">at</span> 0<span class="title">x10f6dfda0</span>&gt;&gt;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>恍然大悟，原来我们调用的方法就是 Faker 对象调用的 Provider 里面的对应方法，比如 name 就是 faker.providers.person.zh_CN.Provider 里面的 name 方法，二者是一致的，我们扒一扒源码验证下，源码在：<a href="https://github.com/joke2k/faker/blob/master/faker/providers/person/__init__.py" target="_blank" rel="noopener">https://github.com/joke2k/faker/blob/master/faker/providers/person/__init__.py</a>，果不其然，里面定义了 name 方法，然后 Faker 动态地将这个方法引入进来了，就可以使用了。</p>
                  <h2 id="方法列举"><a href="#方法列举" class="headerlink" title="方法列举"></a>方法列举</h2>
                  <p>既然有这么多 Provider，下面我们再详细地看看还有哪些常用的方法吧，下面进行一部分简单的梳理，参考来源文档地址为：<a href="https://faker.readthedocs.io/en/master/providers.html" target="_blank" rel="noopener">https://faker.readthedocs.io/en/master/providers.html</a>。</p>
                  <h3 id="Address"><a href="#Address" class="headerlink" title="Address"></a>Address</h3>
                  <p>Address，用于生成一些和地址相关的数据，如地址、城市、邮政编码、街道等内容， 用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.address()</span><br><span class="line"><span class="meta"># <span class="string">'新疆维吾尔自治区杰县南湖武汉街D座 253105'</span></span></span><br><span class="line">faker.building_number()</span><br><span class="line"><span class="meta"># <span class="string">'B座'</span></span></span><br><span class="line">faker.city()</span><br><span class="line"><span class="meta"># <span class="string">'璐县'</span></span></span><br><span class="line">faker.city_name()</span><br><span class="line"><span class="meta"># <span class="string">'贵阳'</span></span></span><br><span class="line">faker.city_suffix()</span><br><span class="line"><span class="meta"># <span class="string">'县'</span></span></span><br><span class="line">faker.country()</span><br><span class="line"><span class="meta"># <span class="string">'阿拉斯加'</span></span></span><br><span class="line">faker.country_code(representation=<span class="string">"alpha-2"</span>)</span><br><span class="line"><span class="meta"># <span class="string">'CR'</span></span></span><br><span class="line">faker.district()</span><br><span class="line"><span class="meta"># <span class="string">'西峰'</span></span></span><br><span class="line">faker.postcode()</span><br><span class="line"><span class="meta"># <span class="string">'726749'</span></span></span><br><span class="line">faker.province()</span><br><span class="line"><span class="meta"># <span class="string">'福建省'</span></span></span><br><span class="line">faker.street_address()</span><br><span class="line"><span class="meta"># <span class="string">'余路N座'</span></span></span><br><span class="line">faker.street_name()</span><br><span class="line"><span class="meta"># <span class="string">'李路'</span></span></span><br><span class="line">faker.street_suffix()</span><br><span class="line"><span class="meta"># <span class="string">'路'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Color"><a href="#Color" class="headerlink" title="Color"></a>Color</h3>
                  <p>Color，用于生成和颜色相关的数据，如 HEX、RGB、RGBA 等格式的颜色，用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.color_name()</span><br><span class="line"><span class="meta"># <span class="string">'DarkKhaki'</span></span></span><br><span class="line">faker.hex_color()</span><br><span class="line"><span class="meta"># <span class="string">'#97d14e'</span></span></span><br><span class="line">faker.rgb_color()</span><br><span class="line"><span class="meta"># <span class="string">'107,179,51'</span></span></span><br><span class="line">faker.rgb_css_color()</span><br><span class="line"><span class="meta"># <span class="string">'rgb(20,46,70)'</span></span></span><br><span class="line">faker.safe_color_name()</span><br><span class="line"><span class="meta"># <span class="string">'navy'</span></span></span><br><span class="line">faker.safe_hex_color()</span><br><span class="line"><span class="meta"># <span class="string">'#dd2200'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Company"><a href="#Company" class="headerlink" title="Company"></a>Company</h3>
                  <p>Company，用于生成公司相关数据，如公司名、公司前缀、公司后缀等内容，用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.bs()</span><br><span class="line"><span class="meta"># <span class="string">'grow rich initiatives'</span></span></span><br><span class="line">faker.catch_phrase()</span><br><span class="line"><span class="meta"># <span class="string">'Self-enabling encompassing function'</span></span></span><br><span class="line">faker.company()</span><br><span class="line"><span class="meta"># <span class="string">'恒聪百汇网络有限公司'</span></span></span><br><span class="line">faker.company_prefix()</span><br><span class="line"><span class="meta"># <span class="string">'晖来计算机'</span></span></span><br><span class="line">faker.company_suffix()</span><br><span class="line"><span class="meta"># <span class="string">'信息有限公司'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Credit-Card"><a href="#Credit-Card" class="headerlink" title="Credit Card"></a>Credit Card</h3>
                  <p>Credit Card，用于生成信用卡相关数据，如过期时间、银行卡号、安全码等内容，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.credit_card_expire(<span class="attribute">start</span>=<span class="string">"now"</span>, <span class="attribute">end</span>=<span class="string">"+10y"</span>, <span class="attribute">date_format</span>=<span class="string">"%m/%y"</span>)</span><br><span class="line"><span class="comment"># '08/20'</span></span><br><span class="line">faker.credit_card_full(<span class="attribute">card_type</span>=None)</span><br><span class="line"><span class="comment"># 'Mastercard\n玉兰 范\n5183689713096897 01/25\nCVV: 012\n'</span></span><br><span class="line">faker.credit_card_number(<span class="attribute">card_type</span>=None)</span><br><span class="line"><span class="comment"># '4009911097184929918'</span></span><br><span class="line">faker.credit_card_provider(<span class="attribute">card_type</span>=None)</span><br><span class="line"><span class="comment"># 'JCB 15 digit'</span></span><br><span class="line">faker.credit_card_security_code(<span class="attribute">card_type</span>=None)</span><br><span class="line"><span class="comment"># '259'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Date-Time"><a href="#Date-Time" class="headerlink" title="Date Time"></a>Date Time</h3>
                  <p>Date Time，用于生成时间相关数据，如年份、月份、星期、出生日期等内容，可以返回 datetime 类型的数据，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.am_pm()</span><br><span class="line"><span class="comment"># 'AM'</span></span><br><span class="line">faker.century()</span><br><span class="line"><span class="comment"># 'X'</span></span><br><span class="line">faker.date(<span class="attribute">pattern</span>=<span class="string">"%Y-%m-%d"</span>, <span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># '1997-06-16'</span></span><br><span class="line">faker.date_between(<span class="attribute">start_date</span>=<span class="string">"-30y"</span>, <span class="attribute">end_date</span>=<span class="string">"today"</span>)</span><br><span class="line"><span class="comment"># datetime.date(2000, 8, 30)</span></span><br><span class="line">faker.date_between_dates(<span class="attribute">date_start</span>=None, <span class="attribute">date_end</span>=None)</span><br><span class="line"><span class="comment"># datetime.date(2019, 7, 30)</span></span><br><span class="line">faker.date_object(<span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># datetime.date(1978, 3, 12)</span></span><br><span class="line">faker.date_of_birth(<span class="attribute">tzinfo</span>=None, <span class="attribute">minimum_age</span>=0, <span class="attribute">maximum_age</span>=115)</span><br><span class="line"><span class="comment"># datetime.date(2012, 6, 3)</span></span><br><span class="line">faker.date_this_century(<span class="attribute">before_today</span>=<span class="literal">True</span>, <span class="attribute">after_today</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># datetime.date(2011, 6, 12)</span></span><br><span class="line">faker.date_this_decade(<span class="attribute">before_today</span>=<span class="literal">True</span>, <span class="attribute">after_today</span>=<span class="literal">False</span>)</span><br><span 
class="line"><span class="comment"># datetime.date(2011, 8, 22)</span></span><br><span class="line">faker.date_this_month(<span class="attribute">before_today</span>=<span class="literal">True</span>, <span class="attribute">after_today</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># datetime.date(2019, 7, 25)</span></span><br><span class="line">faker.date_this_year(<span class="attribute">before_today</span>=<span class="literal">True</span>, <span class="attribute">after_today</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># datetime.date(2019, 7, 22)</span></span><br><span class="line">faker.date_time(<span class="attribute">tzinfo</span>=None, <span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2018, 8, 11, 22, 3, 34)</span></span><br><span class="line">faker.date_time_ad(<span class="attribute">tzinfo</span>=None, <span class="attribute">end_datetime</span>=None, <span class="attribute">start_datetime</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(1566, 8, 26, 16, 25, 30)</span></span><br><span class="line">faker.date_time_between(<span class="attribute">start_date</span>=<span class="string">"-30y"</span>, <span class="attribute">end_date</span>=<span class="string">"now"</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2015, 1, 31, 4, 14, 10)</span></span><br><span class="line">faker.date_time_between_dates(<span class="attribute">datetime_start</span>=None, <span class="attribute">datetime_end</span>=None, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2019, 7, 30, 17, 51, 44)</span></span><br><span class="line">faker.date_time_this_century(<span class="attribute">before_now</span>=<span class="literal">True</span>, 
<span class="attribute">after_now</span>=<span class="literal">False</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2002, 9, 25, 23, 59, 49)</span></span><br><span class="line">faker.date_time_this_decade(<span class="attribute">before_now</span>=<span class="literal">True</span>, <span class="attribute">after_now</span>=<span class="literal">False</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2010, 5, 25, 20, 20, 52)</span></span><br><span class="line">faker.date_time_this_month(<span class="attribute">before_now</span>=<span class="literal">True</span>, <span class="attribute">after_now</span>=<span class="literal">False</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2019, 7, 19, 18, 4, 6)</span></span><br><span class="line">faker.date_time_this_year(<span class="attribute">before_now</span>=<span class="literal">True</span>, <span class="attribute">after_now</span>=<span class="literal">False</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2019, 3, 15, 11, 4, 18)</span></span><br><span class="line">faker.day_of_month()</span><br><span class="line"><span class="comment"># '04'</span></span><br><span class="line">faker.day_of_week()</span><br><span class="line"><span class="comment"># 'Monday'</span></span><br><span class="line">faker.future_date(<span class="attribute">end_date</span>=<span class="string">"+30d"</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.date(2019, 8, 12)</span></span><br><span class="line">faker.future_datetime(<span class="attribute">end_date</span>=<span class="string">"+30d"</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span 
class="comment"># datetime.datetime(2019, 8, 24, 2, 59, 4)</span></span><br><span class="line">faker.iso8601(<span class="attribute">tzinfo</span>=None, <span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># '1987-07-01T18:33:56'</span></span><br><span class="line">faker.month()</span><br><span class="line"><span class="comment"># '11'</span></span><br><span class="line">faker.month_name()</span><br><span class="line"><span class="comment"># 'August'</span></span><br><span class="line">faker.past_date(<span class="attribute">start_date</span>=<span class="string">"-30d"</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.date(2019, 7, 25)</span></span><br><span class="line">faker.past_datetime(<span class="attribute">start_date</span>=<span class="string">"-30d"</span>, <span class="attribute">tzinfo</span>=None)</span><br><span class="line"><span class="comment"># datetime.datetime(2019, 7, 18, 22, 46, 51)</span></span><br><span class="line">faker.time(<span class="attribute">pattern</span>=<span class="string">"%H:%M:%S"</span>, <span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># '16:22:30'</span></span><br><span class="line">faker.time_delta(<span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># datetime.timedelta(0)</span></span><br><span class="line">faker.time_object(<span class="attribute">end_datetime</span>=None)</span><br><span class="line"><span class="comment"># datetime.time(22, 12, 15)</span></span><br><span class="line">faker.time_series(<span class="attribute">start_date</span>=<span class="string">"-30d"</span>, <span class="attribute">end_date</span>=<span class="string">"now"</span>, <span class="attribute">precision</span>=None, <span class="attribute">distrib</span>=None, <span class="attribute">tzinfo</span>=None)</span><br><span 
class="line"><span class="comment"># &lt;generator object Provider.time_series at 0x7fcbce0604f8&gt;</span></span><br><span class="line">faker.timezone()</span><br><span class="line"><span class="comment"># 'Indian/Comoro'</span></span><br><span class="line">faker.unix_time(<span class="attribute">end_datetime</span>=None, <span class="attribute">start_datetime</span>=None)</span><br><span class="line"><span class="comment"># 1182857626</span></span><br><span class="line">faker.year()</span><br><span class="line"><span class="comment"># '1970'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="File"><a href="#File" class="headerlink" title="File"></a>File</h3>
                  <p>File，用于生成文件和文件路径相关的数据，包括文件扩展名、文件路径、MIME_TYPE、磁盘分区等内容，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.file_extension(<span class="attribute">category</span>=None)</span><br><span class="line"><span class="comment"># 'flac'</span></span><br><span class="line">faker.file_name(<span class="attribute">category</span>=None, <span class="attribute">extension</span>=None)</span><br><span class="line"><span class="comment"># '然后.numbers'</span></span><br><span class="line">faker.file_path(<span class="attribute">depth</span>=1, <span class="attribute">category</span>=None, <span class="attribute">extension</span>=None)</span><br><span class="line"><span class="comment"># '/关系/科技.mov'</span></span><br><span class="line">faker.mime_type(<span class="attribute">category</span>=None)</span><br><span class="line"><span class="comment"># 'video/ogg'</span></span><br><span class="line">faker.unix_device(<span class="attribute">prefix</span>=None)</span><br><span class="line"><span class="comment"># '/dev/sdd'</span></span><br><span class="line">faker.unix_partition(<span class="attribute">prefix</span>=None)</span><br><span class="line"><span class="comment"># '/dev/xvds3'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Geo"><a href="#Geo" class="headerlink" title="Geo"></a>Geo</h3>
                  <p>Geo，用于生成和地理位置相关的数据，包括经纬度、时区等信息，用法如下：</p>
                  <figure class="highlight less">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.coordinate</span>(center=None, radius=<span class="number">0.001</span>)</span><br><span class="line"># <span class="selector-tag">Decimal</span>(<span class="string">'-114.420686'</span>)</span><br><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.latitude</span>()</span><br><span class="line"># <span class="selector-tag">Decimal</span>(<span class="string">'-9.772541'</span>)</span><br><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.latlng</span>()</span><br><span class="line"># (Decimal(<span class="string">'-27.0730915'</span>), Decimal(<span class="string">'-5.919460'</span>))</span><br><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.local_latlng</span>(country_code=<span class="string">"US"</span>, coords_only=False)</span><br><span class="line"># (<span class="string">'41.47892'</span>, <span class="string">'-87.45476'</span>, <span class="string">'Schererville'</span>, <span class="string">'US'</span>, <span class="string">'America/Chicago'</span>)</span><br><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.location_on_land</span>(coords_only=False)</span><br><span class="line"># (<span class="string">'12.74482'</span>, <span class="string">'4.52514'</span>, <span class="string">'Argungu'</span>, <span class="string">'NG'</span>, <span class="string">'Africa/Lagos'</span>)</span><br><span class="line"><span class="selector-tag">faker</span><span class="selector-class">.longitude</span>()</span><br><span class="line"># <span class="selector-tag">Decimal</span>(<span class="string">'40.885895'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Internet"><a href="#Internet" class="headerlink" title="Internet"></a>Internet</h3>
                  <p>Internet，用于生成和互联网相关的数据，包括随机电子邮箱、域名、IP 地址、URL、用户名、后缀名等内容，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.ascii_company_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'xuna@xiaqian.cn'</span></span><br><span class="line">faker.ascii_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'min59@60.cn'</span></span><br><span class="line">faker.ascii_free_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'min75@gmail.com'</span></span><br><span class="line">faker.ascii_safe_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'cliu@example.com'</span></span><br><span class="line">faker.company_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'ilong@99.cn'</span></span><br><span class="line">faker.domain_name(<span class="attribute">levels</span>=1)</span><br><span class="line"><span class="comment"># 'xiulan.cn'</span></span><br><span class="line">faker.domain_word(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'luo'</span></span><br><span class="line">faker.email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'maoxiulan@hotmail.com'</span></span><br><span class="line">faker.free_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'yanshen@gmail.com'</span></span><br><span class="line">faker.free_email_domain(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'yahoo.com'</span></span><br><span class="line">faker.hostname(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'lt-18.pan.cn'</span></span><br><span class="line">faker.image_url(<span class="attribute">width</span>=None, <span 
class="attribute">height</span>=None)</span><br><span class="line"><span class="comment"># 'https://placekitten.com/51/201'</span></span><br><span class="line">faker.ipv4(<span class="attribute">network</span>=<span class="literal">False</span>, <span class="attribute">address_class</span>=None, <span class="attribute">private</span>=None)</span><br><span class="line"><span class="comment"># '192.233.68.5'</span></span><br><span class="line">faker.ipv4_network_class()</span><br><span class="line"><span class="comment"># 'a'</span></span><br><span class="line">faker.ipv4_private(<span class="attribute">network</span>=<span class="literal">False</span>, <span class="attribute">address_class</span>=None)</span><br><span class="line"><span class="comment"># '10.9.97.93'</span></span><br><span class="line">faker.ipv4_public(<span class="attribute">network</span>=<span class="literal">False</span>, <span class="attribute">address_class</span>=None)</span><br><span class="line"><span class="comment"># '192.51.22.7'</span></span><br><span class="line">faker.ipv6(<span class="attribute">network</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># 'de57:9c6f:a38c:9864:10ec:6442:775d:5f02'</span></span><br><span class="line">faker.mac_address()</span><br><span class="line"><span class="comment"># '99:80:5c:ab:8c:a9'</span></span><br><span class="line">faker.safe_email(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'tangjuan@example.net'</span></span><br><span class="line">faker.slug(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># ''</span></span><br><span class="line">faker.tld()</span><br><span class="line"><span class="comment"># 'cn'</span></span><br><span class="line">faker.uri()</span><br><span class="line"><span class="comment"># 'http://fangfan.org/app/tag/post/'</span></span><br><span 
class="line">faker.uri_extension()</span><br><span class="line"><span class="comment"># '.php'</span></span><br><span class="line">faker.uri_page()</span><br><span class="line"><span class="comment"># 'about'</span></span><br><span class="line">faker.uri_path(<span class="attribute">deep</span>=None)</span><br><span class="line"><span class="comment"># 'app'</span></span><br><span class="line">faker.url(<span class="attribute">schemes</span>=None)</span><br><span class="line"><span class="comment"># 'http://mingli.cn/'</span></span><br><span class="line">faker.user_name(<span class="number">*a</span>rgs, **kwargs)</span><br><span class="line"><span class="comment"># 'jie54'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Job"><a href="#Job" class="headerlink" title="Job"></a>Job</h3>
                  <p>Job，用于生成和职业相关的数据，用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.job()</span><br><span class="line"><span class="meta"># <span class="string">'烫工'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Lorem"><a href="#Lorem" class="headerlink" title="Lorem"></a>Lorem</h3>
                  <p>Lorem，用于生成一些假文字数据，包括句子、自然段、长文本、关键词等，另外可以传入不同的参数来控制生成的长度，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.paragraph(<span class="attribute">nb_sentences</span>=3, <span class="attribute">variable_nb_sentences</span>=<span class="literal">True</span>, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># '包括的是报告那些一点.图片地址基本全部.'</span></span><br><span class="line">faker.paragraphs(<span class="attribute">nb</span>=3, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># [   '计划规定这样所以组织商品其中.参加成为不同发表地区.精华科技谢谢大家需要.一下手机上海中文工程.',</span></span><br><span class="line"><span class="comment">#     '非常相关是一就是一个一种文章发生.增加那些以及之后以下你的.',</span></span><br><span class="line"><span class="comment">#     '学生应该出来分析增加关系组织.评论来源朋友注册应该需要单位.感觉最后无法发现选择人民.']</span></span><br><span class="line">faker.sentence(<span class="attribute">nb_words</span>=6, <span class="attribute">variable_nb_words</span>=<span class="literal">True</span>, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># '介绍结果自己解决处理.'</span></span><br><span class="line">faker.sentences(<span class="attribute">nb</span>=3, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># ['查看其实一次学习登录浏览是一他们.', '而且资源的人事情.', '科技价格免费大学教育.']</span></span><br><span class="line">faker.text(<span class="attribute">max_nb_chars</span>=200, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># ('只是当前国内中文所以.威望系统在线虽然.\n'</span></span><br><span class="line"><span class="comment">#  '图片人民非常合作这种谢谢更新.名称详细直接社会一直首页完全.\n'</span></span><br><span class="line"><span class="comment">#  '重要更多只要市场.必须只是学生音乐.系统美国类别这些一切环境.\n'</span></span><br><span class="line"><span class="comment">#  '但是的话人民美国关于.\n'</span></span><br><span class="line"><span class="comment">#  '情况专业国际看到研究.音乐环境市场搜索发现.\n'</span></span><br><span class="line"><span class="comment">#  
'工具还是到了今天位置人民.留言作者品牌工程项目必须.上海精华现在我们新闻应该关系.\n'</span></span><br><span class="line"><span class="comment">#  '更新经济能力全部资源如果.手机能够登录国内.')</span></span><br><span class="line">faker.texts(<span class="attribute">nb_texts</span>=3, <span class="attribute">max_nb_chars</span>=200, <span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># [   '成功可能推荐你的行业.地区而且推荐.\n'</span></span><br><span class="line"><span class="comment">#     '网络不断是一主要必须.开始安全服务.\n'</span></span><br><span class="line"><span class="comment">#     '应该网上通过以后通过大学.管理要求有关国际阅读当前.为了应该结果点击公司开始怎么.\n'</span></span><br><span class="line"><span class="comment">#     '成功一次最大生产网站.这种加入她的地址有限.\n'</span></span><br><span class="line"><span class="comment">#     '根据新闻汽车起来非常主题显示必须.有些建设来自作者电话支持.\n'</span></span><br><span class="line"><span class="comment">#     '只是资源还是由于经济事情喜欢.为什中文大小得到服务.网络密码是否免费参加一次社区欢迎.',</span></span><br><span class="line"><span class="comment">#     '部门活动技术.商品影响发生行业密码完成.就是部门结果资料学习当然.或者帮助城市要求首页市场教育你们.\n'</span></span><br><span class="line"><span class="comment">#     '专业完全分析处理城市大学什么.\n'</span></span><br><span class="line"><span class="comment">#     '文件非常国际全部起来积分公司.资料的是电影没有.这是本站需要.\n'</span></span><br><span class="line"><span class="comment">#     '合作重要没有现在市场开发空间.您的会员推荐成功教育进行中国.\n'</span></span><br><span class="line"><span class="comment">#     '文件不是如果评论.因为经验设备规定.\n'</span></span><br><span class="line"><span class="comment">#     '加入一起影响网上大家运行在线如果.工程企业这种以后.',</span></span><br><span class="line"><span class="comment">#     '空间市场出现必须基本电话.显示一个标准其他设计作品.工程不断新闻问题更多更新这么.\n'</span></span><br><span class="line"><span class="comment">#     '一起简介网上内容不会.任何知道各种两个.类别事情经营那么投资市场.\n'</span></span><br><span class="line"><span class="comment">#     '那些使用介绍公司朋友人民你们浏览.应该表示一点一般说明主要谢谢.电话回复起来经验一个来源加入.\n'</span></span><br><span class="line"><span class="comment">#     '地区法律其他表示虽然.参加社会喜欢有限论坛一般发布.类别目前文化可以.\n'</span></span><br><span class="line"><span class="comment">#     
'报告质量工作主要.企业发布完全.得到名称作者等级两个论坛只要电话.']</span></span><br><span class="line">faker.word(<span class="attribute">ext_word_list</span>=None)</span><br><span class="line"><span class="comment"># '注意'</span></span><br><span class="line">faker.words(<span class="attribute">nb</span>=3, <span class="attribute">ext_word_list</span>=None, <span class="attribute">unique</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># ['责任', '组织', '以后']</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里每个方法的参数是不同的，具体的参数解释可以见源代码每个方法的注释：<a href="https://github.com/joke2k/faker/blob/master/faker/providers/lorem/__init__.py" target="_blank" rel="noopener">https://github.com/joke2k/faker/blob/master/faker/providers/lorem/__init__.py</a>。</p>
                  <h3 id="Misc"><a href="#Misc" class="headerlink" title="Misc"></a>Misc</h3>
                  <p>Misc，用于生成一些混淆数据，比如密码以及 sha1、sha256、md5 等哈希后的内容，用法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.boolean(<span class="attribute">chance_of_getting_true</span>=50)</span><br><span class="line"><span class="comment"># True</span></span><br><span class="line">faker.md5(<span class="attribute">raw_output</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># '3166fa26ffd3f2a33e020dfe11191ac6'</span></span><br><span class="line">faker.null_boolean()</span><br><span class="line"><span class="comment"># False</span></span><br><span class="line">faker.password(<span class="attribute">length</span>=10, <span class="attribute">special_chars</span>=<span class="literal">True</span>, <span class="attribute">digits</span>=<span class="literal">True</span>, <span class="attribute">upper_case</span>=<span class="literal">True</span>, <span class="attribute">lower_case</span>=<span class="literal">True</span>)</span><br><span class="line"><span class="comment"># 'W7Ln8La@%O'</span></span><br><span class="line">faker.sha1(<span class="attribute">raw_output</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># 'c8301a2a79445439ee5287f38053e4b3a05eac79'</span></span><br><span class="line">faker.sha256(<span class="attribute">raw_output</span>=<span class="literal">False</span>)</span><br><span class="line"><span class="comment"># '1e909d331e20cf241aaa2da894deae5a3a75e5cdc35c053422d9b8e7ccfa0402'</span></span><br><span class="line">faker.uuid4(<span class="attribute">cast_to</span>=&lt;class <span class="string">'str'</span>&gt;)</span><br><span class="line"><span class="comment"># '6e6fe387-6877-48d9-94ea-4263c4c71aa5'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Person"><a href="#Person" class="headerlink" title="Person"></a>Person</h3>
                  <p>Person，用于生成和人名相关的数据，包括姓氏、名字、全名、英文名等内容，还能区分男女名字，用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.first_name()</span><br><span class="line"><span class="meta"># <span class="string">'颖'</span></span></span><br><span class="line">faker.first_name_female()</span><br><span class="line"><span class="meta"># <span class="string">'芳'</span></span></span><br><span class="line">faker.first_name_male()</span><br><span class="line"><span class="meta"># <span class="string">'利'</span></span></span><br><span class="line">faker.first_romanized_name()</span><br><span class="line"><span class="meta"># <span class="string">'Jing'</span></span></span><br><span class="line">faker.last_name()</span><br><span class="line"><span class="meta"># <span class="string">'温'</span></span></span><br><span class="line">faker.last_name_female()</span><br><span class="line"><span class="meta"># <span class="string">'寇'</span></span></span><br><span class="line">faker.last_name_male()</span><br><span class="line"><span class="meta"># <span class="string">'陈'</span></span></span><br><span class="line">faker.last_romanized_name()</span><br><span class="line"><span class="meta"># <span class="string">'Lei'</span></span></span><br><span class="line">faker.name()</span><br><span class="line"><span class="meta"># <span class="string">'黄明'</span></span></span><br><span class="line">faker.name_female()</span><br><span class="line"><span class="meta"># <span class="string">'张凯'</span></span></span><br><span class="line">faker.name_male()</span><br><span class="line"><span class="meta"># <span class="string">'黄鹏'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="User-Agent"><a href="#User-Agent" class="headerlink" title="User-Agent"></a>User-Agent</h3>
                  <p>User-Agent，用于生成和浏览器 User-Agent 相关的内容，可以定制各种浏览器，还可以传入版本信息来控制生成的内容，用法如下：</p>
                  <figure class="highlight autoit">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">faker.chrome(version_from=<span class="number">13</span>, version_to=<span class="number">63</span>, build_from=<span class="number">800</span>, build_to=<span class="number">899</span>)</span><br><span class="line"><span class="meta"># (<span class="string">'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5332 (KHTML, like Gecko) '</span></span></span><br><span class="line"><span class="meta">#  <span class="string">'Chrome/40.0.837.0 Safari/5332'</span>)</span></span><br><span class="line">faker.firefox()</span><br><span class="line"><span class="meta"># (<span class="string">'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_8_9; rv:1.9.4.20) '</span></span></span><br><span class="line"><span class="meta">#  <span class="string">'Gecko/2019-05-02 05:58:44 Firefox/3.6.19'</span>)</span></span><br><span class="line">faker.internet_explorer()</span><br><span class="line"><span class="meta"># <span class="string">'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/3.0)'</span></span></span><br><span class="line">faker.linux_platform_token()</span><br><span class="line"><span class="meta"># <span class="string">'X11; Linux i686'</span></span></span><br><span class="line">faker.linux_processor()</span><br><span class="line"><span class="meta"># <span class="string">'x86_64'</span></span></span><br><span class="line">faker.mac_platform_token()</span><br><span class="line"><span class="meta"># <span class="string">'Macintosh; U; PPC Mac OS X 10_12_5'</span></span></span><br><span class="line">faker.mac_processor()</span><br><span class="line"><span class="meta"># <span class="string">'U; Intel'</span></span></span><br><span class="line">faker.opera()</span><br><span class="line"><span class="meta"># <span class="string">'Opera/9.77.(Windows NT 4.0; vi-VN) Presto/2.9.182 Version/11.00'</span></span></span><br><span class="line">faker.safari()</span><br><span class="line"><span class="meta"># (<span 
class="string">'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_7_1 rv:5.0; or-IN) '</span></span></span><br><span class="line"><span class="meta">#  <span class="string">'AppleWebKit/535.9.4 (KHTML, like Gecko) Version/5.0.2 Safari/535.9.4'</span>)</span></span><br><span class="line">faker.user_agent()</span><br><span class="line"><span class="meta"># <span class="string">'Opera/8.69.(X11; Linux i686; ml-IN) Presto/2.9.170 Version/11.00'</span></span></span><br><span class="line">faker.windows_platform_token()</span><br><span class="line"><span class="meta"># <span class="string">'Windows NT 6.1'</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>以上仅仅列了一部分，还有更多的功能大家可以查看官方文档的内容，链接为：<a href="https://faker.readthedocs.io/en/master/locales/zh_CN.html" target="_blank" rel="noopener">https://faker.readthedocs.io/en/master/locales/zh_CN.html</a>。</p>
                  <h2 id="其他-Provider"><a href="#其他-Provider" class="headerlink" title="其他 Provider"></a>其他 Provider</h2>
                  <p>另外还有一些社区贡献的 Provider，如 WiFi、微服务相关的，大家可以查看文档的说明，另外需要额外安装这些扩展包并自行添加 Provider，文档见：<a href="https://faker.readthedocs.io/en/master/communityproviders.html" target="_blank" rel="noopener">https://faker.readthedocs.io/en/master/communityproviders.html</a>。 添加 Provider 需要调用 add_provider 方法，用法示例如下：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> faker <span class="keyword">import</span> Faker</span><br><span class="line"><span class="keyword">from</span> faker.providers <span class="keyword">import</span> <span class="built_in">int</span>ernet</span><br><span class="line"></span><br><span class="line">faker = Faker()</span><br><span class="line">faker.add_provider(<span class="built_in">int</span>ernet)</span><br><span class="line">print(faker.ipv4_private())</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>还有更多的内容大家可以参考官方文档，链接：<a href="https://faker.readthedocs.io/" target="_blank" rel="noopener">https://faker.readthedocs.io/</a>。</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-06 12:24:18" itemprop="dateCreated datePublished" datetime="2019-08-06T12:24:18+08:00">2019-08-06</time>
                </span>
                <span id="/7080.html" class="post-meta-item leancloud_visitors" data-flag-title="利用 Python Faker 包来制作假数据" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>15k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>14 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7071.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7071.html" class="post-title-link" itemprop="url">Python 中 typing 模块和类型注解的使用</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <h2 id="实例引入"><a href="#实例引入" class="headerlink" title="实例引入"></a>实例引入</h2>
                  <p>我们知道 Python 是一种动态语言，在声明一个变量时我们不需要显式地声明它的类型，例如下面的例子：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-tag">a</span> = <span class="number">2</span></span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'1 + a ='</span>, <span class="number">1</span> + a)</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果：</p>
                  <figure class="highlight basic">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">1 </span>+ a = <span class="number">3</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里我们首先声明了一个变量 <code>a</code>，并将其赋值为了 2，然后将最后的结果打印出来，程序输出来了正确的结果。但在这个过程中，我们没有声明它到底是什么类型。 但如果这时候我们将 <code>a</code> 变成一个字符串类型，结果会是怎样的呢？改写如下：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="selector-tag">a</span> = <span class="string">'2'</span></span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(<span class="string">'1 + a ='</span>, <span class="number">1</span> + a)</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果：</p>
                  <figure class="highlight scala">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="type">TypeError</span>: unsupported operand <span class="class"><span class="keyword">type</span>(<span class="params">s</span>) <span class="title">for</span> <span class="title">+</span></span>: <span class="symbol">'in</span>t' and <span class="symbol">'st</span>r'</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>直接报错了，错误原因是我们进行了字符串类型的变量和数值类型变量的加和，两种数据类型不同，是无法进行相加的。 如果我们将上面的语句改写成一个方法定义：</p>
                  <figure class="highlight ruby">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">add</span><span class="params">(a)</span></span><span class="symbol">:</span></span><br><span class="line">    <span class="keyword">return</span> a + <span class="number">1</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里定义了一个方法，传入一个参数，然后将其加 1 并返回。 如果这时候用下面的方式调用，传入的参数是一个数值类型：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="title">add</span><span class="params">(<span class="number">2</span>)</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>则可以正常输出结果 3。但如果我们传入的参数并不是我们期望的类型，比如传入一个字符类型，那么就会同样报刚才类似的错误。 但又由于 Python 的特性，很多情况下我们并不用去声明它的类型，因此从方法定义上面来看，我们实际上是不知道一个方法的参数到底应该传入什么类型的。 这样其实就造成了很多不方便的地方，在某些情况下一些复杂的方法，如果不借助于一些额外的说明，我们是不知道参数到底是什么类型的。 因此，Python 中的类型注解就显得比较重要了。</p>
                  <h2 id="类型注解"><a href="#类型注解" class="headerlink" title="类型注解"></a>类型注解</h2>
                  <p>在 Python 3.5 中，Python PEP 484 引入了类型注解（type hints），在 Python 3.6 中，PEP 526 又进一步引入了变量注解（Variable Annotations），所以上面的代码我们改写成如下写法：</p>
                  <figure class="highlight vim">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="variable">a:</span> <span class="keyword">int</span> = <span class="number">2</span></span><br><span class="line"><span class="keyword">print</span>(<span class="string">'5 + a ='</span>, <span class="number">5</span> + <span class="keyword">a</span>)</span><br><span class="line"></span><br><span class="line">def <span class="built_in">add</span>(<span class="variable">a:</span> <span class="keyword">int</span>) -&gt; in<span class="variable">t:</span></span><br><span class="line">    <span class="keyword">return</span> <span class="keyword">a</span> + <span class="number">1</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>具体的语法可以归纳为两点：</p>
                  <ul>
                    <li>在声明变量时，变量的后面可以加一个冒号，后面再写上变量的类型，如 int、list 等等。</li>
                    <li>在声明方法返回值的时候，可以在方法的后面加一个箭头，后面加上返回值的类型，如 int、list 等等。</li>
                  </ul>
                  <p>在 <a href="https://www.python.org/dev/peps/pep-0008/#other-recommendations" target="_blank" rel="noopener">PEP 8</a> 中，具体的格式是这样规定的：</p>
                  <ul>
                    <li>在声明变量类型时，变量后方紧跟一个冒号，冒号后面跟一个空格，再跟上变量的类型。</li>
                    <li>在声明方法返回值的时候，箭头左边是方法定义，箭头右边是返回值的类型，箭头左右两边都要留有空格。</li>
                  </ul>
                  <p>有了这样的声明，以后我们如果看到这个方法的定义，我们就知道传入的参数类型了，如调用 add 方法的时候，我们就知道传入的需要是一个数值类型的变量，而不是字符串类型，非常直观。 但值得注意的是，这种类型和变量注解实际上只是一种类型提示，对运行实际上是没有影响的，比如调用 add 方法的时候，我们传入的不是 int 类型，而是一个 float 类型，它也不会报错，也不会对参数进行类型转换，如：</p>
                  <figure class="highlight stylus">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="title">add</span><span class="params">(<span class="number">1.5</span>)</span></span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>我们传入的是一个 float 类型的数值 1.5，看下运行结果：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="number">2.5</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到，运行结果正常输出，而且 1.5 并没有经过强制类型转换变成 1，否则结果会变成 2。 因此，类型和变量注解只是提供了一种提示，对于运行实际上没有任何影响。 不过有了类型注解，一些 IDE 是可以识别出来并提示的，比如 PyCharm 就可以识别出来在调用某个方法的时候参数类型不一致，会提示 WARNING。 比如上面的调用，如果在 PyCharm 中，就会有如下提示内容：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Expected<span class="built_in"> type </span><span class="string">'int'</span>, got <span class="string">'float'</span> instead</span><br><span class="line">This inspection detects<span class="built_in"> type </span>errors <span class="keyword">in</span> function call expressions. Due <span class="keyword">to</span> dynamic dispatch <span class="keyword">and</span> duck typing, this is possible <span class="keyword">in</span> a limited but useful number of cases. Types of function parameters can be specified <span class="keyword">in</span> docstrings <span class="keyword">or</span> <span class="keyword">in</span> Python 3 function annotations.</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>另外也有一些库是支持类型检查的，比如 mypy，安装之后，利用 mypy 即可检查出 Python 脚本中不符合类型注解的调用情况。 上面只是用一个简单的 int 类型做了示例，下面我们再看下一些相对复杂的数据结构，例如列表、元组、字典等类型怎么样来声明。 可想而知，列表用 list 表示，元组用 tuple 表示，字典用 dict 来表示，那么在声明的时候我们就很自然地写成这样了：</p>
                  <figure class="highlight yaml">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">names:</span> <span class="string">list</span> <span class="string">=</span> <span class="string">['Germey',</span> <span class="string">'Guido'</span><span class="string">]</span></span><br><span class="line"><span class="attr">version:</span> <span class="string">tuple</span> <span class="string">=</span> <span class="string">(3,</span> <span class="number">7</span><span class="string">,</span> <span class="number">4</span><span class="string">)</span></span><br><span class="line"><span class="attr">operations:</span> <span class="string">dict</span> <span class="string">=</span> <span class="string">&#123;'show':</span> <span class="literal">False</span><span class="string">,</span> <span class="attr">'sort':</span> <span class="literal">True</span><span class="string">&#125;</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这么看上去没有问题，确实声明为了对应的类型，但实际上并不能反映整个列表、元组的结构，比如我们只通过类型注解是不知道 names 里面的元素是什么类型的，只知道 names 是一个列表 list 类型，实际上里面都是字符串 str 类型。我们也不知道 version 这个元组的每一个元素是什么类型的，实际上是 int 类型。但这些信息我们都无从得知。因此说，仅仅凭借 list、tuple 这样的声明是非常“弱”的，我们需要一种更强的类型声明。 这时候我们就需要借助于 typing 模块了，它提供了非常“强”的类型支持，比如 <code>List[str]</code>、<code>Tuple[int, int, int]</code> 则可以表示由 str 类型的元素组成的列表和由 int 类型的元素组成的长度为 3 的元组。所以上文的声明写法可以改写成下面的样子：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> typing <span class="keyword">import</span> List, Tuple, Dict</span><br><span class="line"></span><br><span class="line">names: List[str] = [<span class="string">'Germey'</span>, <span class="string">'Guido'</span>]</span><br><span class="line"><span class="keyword">version</span>: Tuple[<span class="type">int</span>, <span class="type">int</span>, <span class="type">int</span>] = (<span class="number">3</span>, <span class="number">7</span>, <span class="number">4</span>)</span><br><span class="line">operations: Dict[str, <span class="type">bool</span>] = &#123;<span class="string">'show'</span>: <span class="keyword">False</span>, <span class="string">'sort'</span>: <span class="keyword">True</span>&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样一来，变量的类型便可以非常直观地体现出来了。 目前 typing 模块也已经被加入到 Python 标准库中，不需要安装第三方模块，我们就可以直接使用了。</p>
                  <h2 id="typing"><a href="#typing" class="headerlink" title="typing"></a>typing</h2>
                  <p>下面我们再来详细看下 typing 模块的具体用法，这里主要会介绍一些常用的注解类型，如 List、Tuple、Dict、Sequence 等等，了解了每个类型的具体使用方法，我们就可以得心应手地对任何变量进行声明了。 在引入的时候就直接通过 typing 模块引入就好了，例如：</p>
                  <figure class="highlight capnproto">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> typing <span class="keyword">import</span> <span class="built_in">List</span>, Tuple</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="List"><a href="#List" class="headerlink" title="List"></a>List</h3>
                  <p>List、列表，是 list 的泛型，基本等同于 list，其后紧跟一个方括号，里面代表了构成这个列表的元素类型，如由数字构成的列表可以声明为：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">var: List[<span class="built_in">int</span> <span class="keyword">or</span> <span class="built_in">float</span>] = [<span class="number">2</span>, <span class="number">3.5</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>另外嵌套声明也是可以的：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">var: List[List[<span class="built_in">int</span>]] = [[<span class="number">1</span>, <span class="number">2</span>], [<span class="number">2</span>, <span class="number">3</span>]]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Tuple、NamedTuple"><a href="#Tuple、NamedTuple" class="headerlink" title="Tuple、NamedTuple"></a>Tuple、NamedTuple</h3>
                  <p>Tuple、元组，是 tuple 的泛型，其后紧跟一个方括号，方括号中按照顺序声明了构成本元组的元素类型，如 <code>Tuple[X, Y]</code> 代表了构成元组的第一个元素是 X 类型，第二个元素是 Y 类型。 比如想声明一个元组，分别代表姓名、年龄、身高，三个数据类型分别为 str、int、float，那么可以这么声明：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">person: Tuple[str, <span class="built_in">int</span>, <span class="built_in">float</span>] = (<span class="string">'Mike'</span>, <span class="number">22</span>, <span class="number">1.75</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>同样地也可以使用类型嵌套。 NamedTuple，是 collections.namedtuple 的泛型，实际上就和 namedtuple 用法完全一致，但个人其实并不推荐使用 NamedTuple，推荐使用 attrs 这个库来声明一些具有表征意义的类。</p>
                  <h3 id="Dict、Mapping、MutableMapping"><a href="#Dict、Mapping、MutableMapping" class="headerlink" title="Dict、Mapping、MutableMapping"></a>Dict、Mapping、MutableMapping</h3>
                  <p>Dict、字典，是 dict 的泛型；Mapping，映射，是 collections.abc.Mapping 的泛型。根据官方文档，Dict 推荐用于注解返回类型，Mapping 推荐用于注解参数。它们的使用方法都是一样的，其后跟一个中括号，中括号内分别声明键名、键值的类型，如：</p>
                  <figure class="highlight processing">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def <span class="built_in">size</span>(<span class="built_in">rect</span>: Mapping[<span class="built_in">str</span>, <span class="built_in">int</span>]) -&gt; Dict[<span class="built_in">str</span>, <span class="built_in">int</span>]:</span><br><span class="line">    <span class="keyword">return</span> &#123;<span class="string">'width'</span>: <span class="built_in">rect</span>[<span class="string">'width'</span>] + <span class="number">100</span>, <span class="string">'height'</span>: <span class="built_in">rect</span>[<span class="string">'width'</span>] + <span class="number">100</span>&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里将 Dict 用作了返回值类型注解，将 Mapping 用作了参数类型注解。 MutableMapping 则是 Mapping 对象的子类，在很多库中也经常用 MutableMapping 来代替 Mapping。</p>
                  <h3 id="Set、AbstractSet"><a href="#Set、AbstractSet" class="headerlink" title="Set、AbstractSet"></a>Set、AbstractSet</h3>
                  <p>Set、集合，是 set 的泛型；AbstractSet，是 collections.abc.Set 的泛型。根据官方文档，Set 推荐用于注解返回类型，AbstractSet 用于注解参数。它们的使用方法都是一样的，其后跟一个中括号，里面声明集合中元素的类型，如：</p>
                  <figure class="highlight sql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def <span class="keyword">describe</span>(s: AbstractSet[<span class="built_in">int</span>]) -&gt; <span class="keyword">Set</span>[<span class="built_in">int</span>]:</span><br><span class="line">    <span class="keyword">return</span> <span class="keyword">set</span>(s)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里将 Set 用作了返回值类型注解，将 AbstractSet 用作了参数类型注解。</p>
                  <h3 id="Sequence"><a href="#Sequence" class="headerlink" title="Sequence"></a>Sequence</h3>
                  <p>Sequence，是 collections.abc.Sequence 的泛型，在某些情况下，我们可能并不需要严格区分一个变量或参数到底是列表 list 类型还是元组 tuple 类型，我们可以使用一个更为泛化的类型，叫做 Sequence，其用法类似于 List，如：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">def</span> square(<span class="string">elements:</span> Sequence[<span class="keyword">float</span>]) -&gt; List[<span class="keyword">float</span>]:</span><br><span class="line">    <span class="keyword">return</span> [x ** <span class="number">2</span> <span class="keyword">for</span> x <span class="keyword">in</span> elements]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="NoReturn"><a href="#NoReturn" class="headerlink" title="NoReturn"></a>NoReturn</h3>
                  <p>NoReturn，当一个方法没有返回结果时，为了注解它的返回类型，我们可以将其注解为 NoReturn，例如：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">hello</span><span class="params">()</span> -&gt; NoReturn:</span></span><br><span class="line">    print(<span class="string">'hello'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Any"><a href="#Any" class="headerlink" title="Any"></a>Any</h3>
                  <p>Any，是一种特殊的类型，它可以代表所有类型，静态类型检查器的所有类型都与 Any 类型兼容，所有没有参数类型注解和返回类型注解的方法都会默认使用 Any 类型，也就是说，下面两个方法的声明是完全等价的：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">add</span><span class="params">(a)</span>:</span></span><br><span class="line">    <span class="keyword">return</span> a + <span class="number">1</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">add</span><span class="params">(a: Any)</span> -&gt; Any:</span></span><br><span class="line">    <span class="keyword">return</span> a + <span class="number">1</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>原理类似于 object，所有的类型都是 object 的子类。但如果我们将参数声明为 object 类型，静态参数类型检查便会抛出错误，而 Any 则不会，具体可以参考官方文档的说明：<a href="https://docs.python.org/zh-cn/3/library/typing.html?highlight=typing#the-any-type" target="_blank" rel="noopener">https://docs.python.org/zh-cn/3/library/typing.html?highlight=typing#the-any-type</a>。</p>
                  <h3 id="TypeVar"><a href="#TypeVar" class="headerlink" title="TypeVar"></a>TypeVar</h3>
                  <p>TypeVar，我们可以借助它来自定义兼容特定类型的变量，比如有的变量声明为 int、float、None 都是符合要求的，实际就是代表任意的数字或者空内容都可以，其他的类型则不可以，比如列表 list、字典 dict 等等，像这样的情况，我们可以使用 TypeVar 来表示。 例如一个人的身高，便可以使用 int 或 float 或 None 来表示，但不能用 dict 来表示，所以可以这么声明：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">height = <span class="number">1.75</span></span><br><span class="line">Height = TypeVar(<span class="string">'Height'</span>, int, float, <span class="literal">None</span>)</span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_height</span><span class="params">()</span> -&gt; Height:</span></span><br><span class="line">    <span class="keyword">return</span> height</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里我们使用 TypeVar 声明了一个 Height 类型，然后将其用于注解方法的返回结果。</p>
                  <h3 id="NewType"><a href="#NewType" class="headerlink" title="NewType"></a>NewType</h3>
                  <p>NewType，我们可以借助于它来声明一些具有特殊含义的类型，例如像 Tuple 的例子一样，我们需要将它表示为 Person，即一个人的含义，但单从表面上声明为 Tuple 并不直观，所以我们可以使用 NewType 为其声明一个类型，如：</p>
                  <figure class="highlight ini">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">Person</span> = NewType(<span class="string">'Person'</span>, Tuple[str, int, float])</span><br><span class="line"><span class="attr">person</span> = Person((<span class="string">'Mike'</span>, <span class="number">22</span>, <span class="number">1.75</span>))</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里实际上 person 就是一个 tuple 类型，我们可以对其像 tuple 一样正常操作。</p>
                  <h3 id="Callable"><a href="#Callable" class="headerlink" title="Callable"></a>Callable</h3>
                  <p>Callable，可调用类型，它通常用来注解一个方法，比如我们刚才声明了一个 add 方法，它就是一个 Callable 类型：</p>
                  <figure class="highlight lisp">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">print(<span class="name">Callable</span>, type(<span class="name">add</span>), isinstance(<span class="name">add</span>, Callable))</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果：</p>
                  <figure class="highlight angelscript">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">typing.Callable &lt;<span class="keyword">class</span> '<span class="symbol">function</span>'&gt; <span class="symbol">True</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里虽然 add 利用 type 方法得到的结果是 function，但实际上利用 isinstance 方法判断确实是 True。 Callable 在声明的时候需要使用 <code>Callable[[Arg1Type, Arg2Type, ...], ReturnType]</code> 这样的类型注解，将参数类型和返回值类型都要注解出来，例如：</p>
                  <figure class="highlight groovy">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">def</span> date(<span class="string">year:</span> <span class="keyword">int</span>, <span class="string">month:</span> <span class="keyword">int</span>, <span class="string">day:</span> <span class="keyword">int</span>) -&gt; <span class="string">str:</span></span><br><span class="line">    <span class="keyword">return</span> f<span class="string">'&#123;year&#125;-&#123;month&#125;-&#123;day&#125;'</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">def</span> get_date_fn() -&gt; Callable[[<span class="keyword">int</span>, <span class="keyword">int</span>, <span class="keyword">int</span>], str]:</span><br><span class="line">    <span class="keyword">return</span> date</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里首先声明了一个方法 date，接收三个 int 参数，返回一个 str 结果，get_date_fn 方法返回了这个方法本身，它的返回值类型就可以标记为 Callable，中括号内分别标记了返回的方法的参数类型和返回值类型。</p>
                  <h3 id="Union"><a href="#Union" class="headerlink" title="Union"></a>Union</h3>
                  <p>Union，联合类型，<code>Union[X, Y]</code> 代表要么是 X 类型，要么是 Y 类型。 联合类型的联合类型等价于展平后的类型：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">Union</span>[<span class="keyword">Union</span>[<span class="type">int</span>, str], <span class="type">float</span>] == <span class="keyword">Union</span>[<span class="type">int</span>, str, <span class="type">float</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>仅有一个参数的联合类型会坍缩成参数自身，比如：</p>
                  <figure class="highlight pgsql">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">Union</span>[<span class="type">int</span>] == <span class="type">int</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>多余的参数会被跳过，比如：</p>
                  <figure class="highlight axapta">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Union[<span class="keyword">int</span>, <span class="keyword">str</span>, <span class="keyword">int</span>] == Union[<span class="keyword">int</span>, <span class="keyword">str</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在比较联合类型的时候，参数顺序会被忽略，比如：</p>
                  <figure class="highlight axapta">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">Union[<span class="keyword">int</span>, <span class="keyword">str</span>] == Union[<span class="keyword">str</span>, <span class="keyword">int</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这个在一些方法参数声明的时候比较有用，比如一个方法，要么传一个字符串表示的方法名，要么直接把方法传过来：</p>
                  <figure class="highlight armasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="symbol">def</span> process(<span class="meta">fn</span>: Union[<span class="keyword">str, </span>Callable]):</span><br><span class="line">    <span class="meta">if</span> isinstance(<span class="meta">fn</span>, <span class="keyword">str):</span></span><br><span class="line"><span class="keyword"> </span>       # <span class="keyword">str2fn </span><span class="keyword">and </span>process</span><br><span class="line">        pass</span><br><span class="line">    <span class="meta">elif</span> isinstance(<span class="meta">fn</span>, Callable):</span><br><span class="line">        <span class="meta">fn</span>()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样的声明在一些类库方法定义的时候十分常见。</p>
                  <h3 id="Optional"><a href="#Optional" class="headerlink" title="Optional"></a>Optional</h3>
                  <p>Optional，意思是说这个参数可以为空或已经声明的类型，即 <code>Optional[X]</code> 等价于 <code>Union[X, None]</code>。 但值得注意的是，这个并不等价于可选参数，当它作为参数类型注解的时候，不代表这个参数可以不传递了，而是说这个参数可以传为 None。 例如一个方法在执行完毕时不返回错误信息，在发生问题时才返回错误信息，则可以这么声明：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">judge</span><span class="params">(result: bool)</span> -&gt; Optional[str]:</span></span><br><span class="line">    <span class="keyword">if</span> result: <span class="keyword">return</span> <span class="string">'Error Occurred'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h3 id="Generator"><a href="#Generator" class="headerlink" title="Generator"></a>Generator</h3>
                  <p>如果想代表一个生成器类型，可以使用 Generator，它的声明比较特殊，其后的中括号紧跟着三个参数，分别代表 YieldType、SendType、ReturnType，如：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">echo_round</span><span class="params">()</span> -&gt; Generator[int, float, str]:</span></span><br><span class="line">    sent = <span class="keyword">yield</span> <span class="number">0</span></span><br><span class="line">    <span class="keyword">while</span> sent &gt;= <span class="number">0</span>:</span><br><span class="line">        sent = <span class="keyword">yield</span> round(sent)</span><br><span class="line">    <span class="keyword">return</span> <span class="string">'Done'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里 yield 关键字后面紧跟的变量的类型就是 YieldType，yield 返回的结果的类型就是 SendType，最后生成器 return 的内容就是 ReturnType。 当然很多情况下，生成器往往只需要 yield 内容就够了，我们是不需要 SendType 和 ReturnType 的，可以将其设置为空，如：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">infinite_stream</span><span class="params">(start: int)</span> -&gt; Generator[int, <span class="keyword">None</span>, <span class="keyword">None</span>]:</span></span><br><span class="line">    <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">        <span class="keyword">yield</span> start</span><br><span class="line">        start += <span class="number">1</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <h2 id="案例实战"><a href="#案例实战" class="headerlink" title="案例实战"></a>案例实战</h2>
                  <p>接下来让我们看一个实际的项目，看看经常用到的类型一般是怎么使用的。 这里我们看的库是 requests-html，是由 Kenneth Reitz 所开发的，其 GitHub 地址为：<a href="https://github.com/psf/requests-html" target="_blank" rel="noopener">https://github.com/psf/requests-html</a>，下面我们主要看看它的源代码中一些类型是如何声明的。 这个库的源代码其实就一个文件，那就是 <a href="https://github.com/psf/requests-html/blob/master/requests_html.py" target="_blank" rel="noopener">https://github.com/psf/requests-html/blob/master/requests_html.py</a>，我们看一下它里面的一些 typing 的定义和方法定义。 首先 Typing 的定义部分如下：</p>
                  <figure class="highlight sqf">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> typing import <span class="built_in">Set</span>, Union, <span class="built_in">List</span>, MutableMapping, Optional</span><br><span class="line"></span><br><span class="line"><span class="variable">_Find</span> = Union[<span class="built_in">List</span>[<span class="string">'Element'</span>], <span class="string">'Element'</span>]</span><br><span class="line"><span class="variable">_XPath</span> = Union[<span class="built_in">List</span>[<span class="built_in">str</span>], <span class="built_in">List</span>[<span class="string">'Element'</span>], <span class="built_in">str</span>, <span class="string">'Element'</span>]</span><br><span class="line"><span class="variable">_Result</span> = Union[<span class="built_in">List</span>[<span class="string">'Result'</span>], <span class="string">'Result'</span>]</span><br><span class="line"><span class="variable">_HTML</span> = Union[<span class="built_in">str</span>, bytes]</span><br><span class="line"><span class="variable">_BaseHTML</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_UserAgent</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_DefaultEncoding</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_URL</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_RawHTML</span> = bytes</span><br><span class="line"><span class="variable">_Encoding</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_LXML</span> = HtmlElement</span><br><span class="line"><span class="variable">_Text</span> = <span class="built_in">str</span></span><br><span class="line"><span class="variable">_Search</span> = Result</span><br><span class="line"><span class="variable">_Containing</span> = Union[<span class="built_in">str</span>, <span 
class="built_in">List</span>[<span class="built_in">str</span>]]</span><br><span class="line"><span class="variable">_Links</span> = <span class="built_in">Set</span>[<span class="built_in">str</span>]</span><br><span class="line"><span class="variable">_Attrs</span> = MutableMapping</span><br><span class="line"><span class="variable">_Next</span> = Union[<span class="string">'HTML'</span>, <span class="built_in">List</span>[<span class="built_in">str</span>]]</span><br><span class="line"><span class="variable">_NextSymbol</span> = <span class="built_in">List</span>[<span class="built_in">str</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里可以看到主要用到的类型有 Set、Union、List、MutableMapping、Optional，这些在上文都已经做了解释，另外这里使用了多次 Union 来声明了一些新的类型，如 <code>_Find</code> 则要么是 Element 对象的列表，要么是单个 Element 对象，<code>_Result</code> 则要么是 Result 对象的列表，要么是单个 Result 对象。另外 <code>_Attrs</code> 其实就是字典类型，这里用 MutableMapping 来表示了，没有用 Dict，也没有用 Mapping。 接下来再看一个 Element 类的声明：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Element</span><span class="params">(BaseParser)</span>:</span></span><br><span class="line">    <span class="string">"""An element of HTML.</span></span><br><span class="line"><span class="string">    :param element: The element from which to base the parsing upon.</span></span><br><span class="line"><span class="string">    :param url: The URL from which the HTML originated, used for ``absolute_links``.</span></span><br><span class="line"><span class="string">    :param default_encoding: Which encoding to default to.</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line"></span><br><span class="line">    __slots__ = [</span><br><span class="line">        <span class="string">'element'</span>, <span class="string">'url'</span>, <span class="string">'skip_anchors'</span>, <span class="string">'default_encoding'</span>, <span class="string">'_encoding'</span>,</span><br><span class="line">        <span class="string">'_html'</span>, <span class="string">'_lxml'</span>, <span class="string">'_pq'</span>, <span class="string">'_attrs'</span>, <span class="string">'session'</span></span><br><span class="line">    ]</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None)</span> -&gt; <span class="keyword">None</span>:</span></span><br><span class="line">        super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)</span><br><span class="line">        self.element = element</span><br><span class="line">        self.tag = element.tag</span><br><span class="line">        self.lineno = element.sourceline</span><br><span class="line">        self._attrs = <span 
class="literal">None</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__repr__</span><span class="params">(self)</span> -&gt; str:</span></span><br><span class="line">        attrs = [<span class="string">'&#123;&#125;=&#123;&#125;'</span>.format(attr, repr(self.attrs[attr])) <span class="keyword">for</span> attr <span class="keyword">in</span> self.attrs]</span><br><span class="line">        <span class="keyword">return</span> <span class="string">"&lt;Element &#123;&#125; &#123;&#125;&gt;"</span>.format(repr(self.element.tag), <span class="string">' '</span>.join(attrs))</span><br><span class="line"></span><br><span class="line"><span class="meta">    @property</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">attrs</span><span class="params">(self)</span> -&gt; _Attrs:</span></span><br><span class="line">        <span class="string">"""Returns a dictionary of the attributes of the :class:`Element &lt;Element&gt;`</span></span><br><span class="line"><span class="string">        (`learn more &lt;https://www.w3schools.com/tags/ref_attributes.asp&gt;`_).</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">if</span> self._attrs <span class="keyword">is</span> <span class="literal">None</span>:</span><br><span class="line">            self._attrs = &#123;k: v <span class="keyword">for</span> k, v <span class="keyword">in</span> self.element.items()&#125;</span><br><span class="line"></span><br><span class="line">            <span class="comment"># Split class and rel up, as there are ussually many of them:</span></span><br><span class="line">            <span class="keyword">for</span> attr <span class="keyword">in</span> [<span class="string">'class'</span>, <span class="string">'rel'</span>]:</span><br><span class="line">    
            <span class="keyword">if</span> attr <span class="keyword">in</span> self._attrs:</span><br><span class="line">                    self._attrs[attr] = tuple(self._attrs[attr].split())</span><br><span class="line"></span><br><span class="line">        <span class="keyword">return</span> self._attrs</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里 <code>__init__</code> 方法接收非常多的参数，同时使用 <code>_URL</code> 、<code>_DefaultEncoding</code> 进行了参数类型注解，另外 attrs 方法使用了 <code>_Attrs</code> 进行了返回结果类型注解。 整体看下来，每个参数的类型、返回值都进行了清晰地注解，代码可读性大大提高。 以上便是类型注解和 typing 模块的详细介绍。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-04 20:28:17" itemprop="dateCreated datePublished" datetime="2019-08-04T20:28:17+08:00">2019-08-04</time>
                </span>
                <span id="/7071.html" class="post-meta-item leancloud_visitors" data-flag-title="Python 中 typing 模块和类型注解的使用" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>9.2k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>8 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7051.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7051.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 9.3-付费讯代理、阿布云代理的使用</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>相对免费代理来说，付费代理的稳定性相对更高一点，本节介绍一下爬虫付费代理的相关使用过程。</p>
                  <h3 id="1-付费代理分类"><a href="#1-付费代理分类" class="headerlink" title="1. 付费代理分类"></a>1. 付费代理分类</h3>
                  <p>在这里将付费代理分为两类：</p>
                  <ul>
                    <li>提供接口获取海量代理，按天或者按量付费，如讯代理</li>
                    <li>搭建了代理隧道，直接设置固定域名代理，如阿布云</li>
                  </ul>
                  <p>本节讲解一下这两种代理的使用方法，分别以两家代表性的代理网站为例进行讲解。</p>
                  <h3 id="2-讯代理"><a href="#2-讯代理" class="headerlink" title="2. 讯代理"></a>2. 讯代理</h3>
                  <p>讯代理个人使用过代理有效率还是蛮高的，此处非广告，其官网为：<a href="http://www.xdaili.cn/" target="_blank" rel="noopener">http://www.xdaili.cn/</a>，如图 9-5 所示： <img src="./assets/9-5.png" alt=""> 图 9-5 讯代理官网 有多种类别的代理可供选购，摘抄其官网的各类别代理介绍如下：</p>
                  <ul>
                    <li>优质代理： 适合对代理IP需求量非常大，但能接受代理有效时长较短（10~30分钟），小部分不稳定的客户</li>
                    <li>独享动态： 适合对代理IP稳定性要求非常高，且可以自主控制的客户，支持地区筛选。</li>
                    <li>独享秒切： 适合对代理IP稳定性要求非常高，且可以自主控制的客户，快速获取IP，地区随机分配</li>
                    <li>动态混拨： 适合对代理IP需求量大，代理IP使用时效短（3分钟），切换快的客户</li>
                    <li>优质定制： 如果优质代理的套餐不能满足您的需求，请使用定制服务</li>
                  </ul>
                  <p>一般选择第一类别优质代理即可，代理量比较大，但是代理的稳定性没那么高，有一些代理也是不可用的，所以这种代理的使用方式就需要借助于上一节所说的代理池，我们自己再做一次筛选，确保代理可用。 可以购买一天的试一下效果，购买之后会提供一个 API 来提取代理，如图 9-6 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060609.jpg" alt=""> 图 9-6 提取页面 比如在这里我的提取 API 为：<a href="http://www.xdaili.cn/ipagent/greatRecharge/getGreatIp?spiderId=da289b78fec24f19b392e04106253f2a&amp;orderno=YZ20177140586mTTnd7&amp;returnType=2&amp;count=20" target="_blank" rel="noopener">http://www.xdaili.cn/ipagent/greatRecharge/getGreatIp?spiderId=da289b78fec24f19b392e04106253f2a&amp;orderno=YZ20177140586mTTnd7&amp;returnType=2&amp;count=20</a>，可能已过期，在此仅做演示。 在这里指定了提取数量为 20，提取格式为 Json，直接访问链接即可提取代理，结果如图 9-7 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060621.jpg" alt=""> 图 9-7 提取结果 接下来我们要做的就是解析这个 Json，然后将其放入我们的代理池中。 当然如果信赖讯代理的话也可以不做代理池筛选，直接使用，不过我个人还是推荐再使用代理池筛选一遍，提高可用几率。 根据上一节代理池的写法，我们只需要在 Crawler 中再加入一个 crawl 开头的方法即可。 方法实现如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def crawl_xdaili(self):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        获取讯代理</span></span><br><span class="line"><span class="string">        :return: 代理</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        url = <span class="string">'http://www.xdaili.cn/ipagent/greatRecharge/getGreatIp?spiderId=da289b78fec24f19b392e04106253f2a&amp;orderno=YZ20177140586mTTnd7&amp;returnType=2&amp;count=20'</span></span><br><span class="line">        html = get_page(url)</span><br><span class="line">        <span class="keyword">if</span> html:</span><br><span class="line">            result = json.loads(html)</span><br><span class="line">            proxies = result.<span class="builtin-name">get</span>(<span class="string">'RESULT'</span>)</span><br><span class="line">            <span class="keyword">for</span><span class="built_in"> proxy </span><span class="keyword">in</span> proxies:</span><br><span class="line">                yield proxy.<span class="builtin-name">get</span>(<span class="string">'ip'</span>) + <span class="string">':'</span> + proxy.<span class="builtin-name">get</span>(<span class="string">'port'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样我们就在代理池中接入了讯代理，获取讯代理的结果之后，解析 Json，返回代理即可。 这样代理池运行之后就会抓取和检测该接口返回的代理了，如果可用，那么就会被设为 100，通过代理池接口即可获取到。 以上以讯代理为例说明了此种批量提取代理的使用方法。</p>
                  <h3 id="3-阿布云代理"><a href="#3-阿布云代理" class="headerlink" title="3. 阿布云代理"></a>3. 阿布云代理</h3>
                  <p>阿布云代理提供了代理隧道，代理速度快而且非常稳定，此处依然非广告，其官网为：<a href="https://www.abuyun.com/" target="_blank" rel="noopener">https://www.abuyun.com/</a>，如图 9-8 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060629.png" alt=""> 图 9-8 阿布云官网 阿布云的代理主要分为两种，专业版和动态版，另外还有定制版，摘抄官网的介绍如下：</p>
                  <ul>
                    <li>专业版，多个请求锁定一个代理 IP，海量 IP 资源池需求，近 300 个区域全覆盖，代理 IP 可连续使用1分钟，适用于请求 IP 连续型业务</li>
                    <li>动态版，每个请求一个随机代理 IP，海量 IP 资源池需求，近 300 个区域全覆盖，适用于爬虫类业务</li>
                    <li>定制版，灵活按照需求定制，定制 IP 区域，定制 IP 使用时长，定制 IP 每秒请求数</li>
                  </ul>
                  <p>关于专业版和动态版的更多介绍可以查看官网：<a href="https://www.abuyun.com/http-proxy/dyn-intro.html" target="_blank" rel="noopener">https://www.abuyun.com/http-proxy/dyn-intro.html</a>。 对于爬虫来说，推荐使用动态版，购买之后可以在后台看到代理隧道的用户名和密码，如图 9-9 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060650.jpg" alt=""> 图 9-9 阿布云代理后台 可以发现整个代理的连接域名为 proxy.abuyun.com，端口为 9020，均是固定的，但是使用之后每次的 IP 都会更改，这其实就是利用了代理隧道实现。 其官网原理介绍如下：</p>
                  <ul>
                    <li>云代理通过代理隧道的形式提供高匿名代理服务，支持 HTTP/HTTPS 协议。</li>
                    <li>云代理在云端维护一个全局 IP 池供代理隧道使用，池中的 IP 会不间断更新，以保证同一时刻 IP 池中有几十到几百个可用代理IP。</li>
                    <li>需要注意的是代理IP池中有部分 IP 可能会在当天重复出现多次。</li>
                    <li>动态版HTTP代理隧道会为每个请求从 IP 池中挑选一个随机代理 IP。</li>
                    <li>无须切换代理 IP，每一个请求一个随机代理IP。</li>
                    <li>HTTP代理隧道有并发请求限制，默认每秒只允许 5 个请求。如果需要更多请求数，请额外购买。</li>
                  </ul>
                  <p>注意默认套餐的并发请求是 5 个，如果需要更多需要另外购买。 使用的教程在官网也有，链接为：<a href="https://www.abuyun.com/http-proxy/dyn-manual-python.html" target="_blank" rel="noopener">https://www.abuyun.com/http-proxy/dyn-manual-python.html</a>，提供了 Requests、Urllib、Scrapy 的接入方式。 以 Requests 为例，接入示例如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line"></span><br><span class="line">url = <span class="string">'http://httpbin.org/get'</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 代理服务器</span></span><br><span class="line">proxy_host = <span class="string">'proxy.abuyun.com'</span></span><br><span class="line">proxy_port = <span class="string">'9020'</span></span><br><span class="line"></span><br><span class="line"><span class="comment"># 代理隧道验证信息</span></span><br><span class="line">proxy_user = <span class="string">'H01234567890123D'</span></span><br><span class="line">proxy_pass = <span class="string">'0123456789012345'</span></span><br><span class="line"></span><br><span class="line">proxy_meta = <span class="string">'http://%(user)s:%(pass)s@%(host)s:%(port)s'</span> % &#123;</span><br><span class="line">    <span class="string">'host'</span>: proxy_host,</span><br><span class="line">    <span class="string">'port'</span>: proxy_port,</span><br><span class="line">    <span class="string">'user'</span>: proxy_user,</span><br><span class="line">    <span class="string">'pass'</span>: proxy_pass,</span><br><span class="line">&#125;</span><br><span class="line">proxies = &#123;</span><br><span class="line">    <span class="string">'http'</span>: proxy_meta,</span><br><span class="line">    <span class="string">'https'</span>: proxy_meta,</span><br><span class="line">&#125;</span><br><span class="line">response = requests.<span class="builtin-name">get</span>(url, <span class="attribute">proxies</span>=proxies)</span><br><span class="line"><span class="builtin-name">print</span>(response.status_code)</span><br><span class="line"><span class="builtin-name">print</span>(response.text)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里其实就是使用了代理认证，在前面我们也提到过类似的设置方法，运行结果如下：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="number">200</span></span><br><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept"</span>: <span class="string">"*/*"</span>, </span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"gzip, deflate"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"python-requests/2.18.1"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"60.207.237.111"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>输出结果的 origin 即为代理 IP 的实际地址，可以多次运行测试，可以发现每次请求 origin 都会变化，这就是动态版代理的效果。 这种效果其实跟我们之前的代理池的随机代理效果类似，都是随机取出了一个当前可用代理。 但是此服务相比于维护代理池来说，使用更加方便，配置简单，省时省力，在价格可以接受的情况下，个人推荐此种代理。</p>
                  <h3 id="4-结语"><a href="#4-结语" class="headerlink" title="4. 结语"></a>4. 结语</h3>
                  <p>以上便是付费代理的相关使用方法，稳定性相比免费代理更高，可以自行选购合适的代理。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 14:07:57" itemprop="dateCreated datePublished" datetime="2019-08-02T14:07:57+08:00">2019-08-02</time>
                </span>
                <span id="/7051.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 9.3-付费讯代理、阿布云代理的使用" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>3.3k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>3 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7048.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7048.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 9.2-代理池的维护</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>
                  <p>我们在上一节了解了代理的设置方法，利用代理我们可以解决目标网站封 IP 的问题，而在网上又有大量公开的免费代理，其中有一部分可以拿来使用，或者我们也可以购买付费的代理 IP，价格也不贵。但是不论是免费的还是付费的，都不能保证它们每一个都是可用的，毕竟可能其他人也可能在用此 IP 爬取同样的目标站点而被封禁，或者代理服务器突然出故障或网络繁忙。一旦我们选用了一个不可用的代理，势必会影响我们爬虫的工作效率。 所以说，在用代理时，我们需要提前做一下筛选，将不可用的代理剔除掉，保留下可用代理，接下来在获取代理时从可用代理里面取出直接使用就好了。 所以本节我们来搭建一个高效易用的代理池。</p>
                  <h3 id="1-准备工作"><a href="#1-准备工作" class="headerlink" title="1. 准备工作"></a>1. 准备工作</h3>
                  <p>要实现代理池，我们首先需要成功安装好 Redis 数据库并启动服务，另外还需要安装 Aiohttp、Requests、RedisPy、PyQuery、Flask 库，如果没有安装可以参考第一章的安装说明。</p>
                  <h3 id="2-代理池的目标"><a href="#2-代理池的目标" class="headerlink" title="2. 代理池的目标"></a>2. 代理池的目标</h3>
                  <p>代理池要做到易用、高效，我们一般需要做到下面的几个目标：</p>
                  <ul>
                    <li>基本模块分为四块，获取模块、存储模块、检测模块、接口模块。</li>
                    <li>获取模块需要定时去各大代理网站抓取代理，代理可以是免费公开代理也可以是付费代理，代理的形式都是 IP 加端口，尽量从不同来源获取，尽量抓取高匿代理，抓取完之后将可用代理保存到数据库中。</li>
                    <li>存储模块负责存储抓取下来的代理。首先我们需要保证代理不重复，另外我们还需要标识代理的可用情况，而且需要动态实时处理每个代理，所以说，一种比较高效和方便的存储方式就是使用 Redis 的 Sorted Set，也就是有序集合。</li>
                    <li>检测模块需要定时将数据库中的代理进行检测，在这里我们需要设置一个检测链接，最好是爬取哪个网站就检测哪个网站，这样更加有针对性，如果要做一个通用型的代理，那可以设置百度等链接来检测。另外我们需要标识每一个代理的状态，如设置分数标识，100 分代表可用，分数越少代表越不可用，检测一次如果可用，我们可以将其立即设置为100 满分，也可以在原基础上加 1 分，当不可用，可以将其减 1 分，当减到一定阈值后就直接从数据库移除。通过这样的标识分数，我们就可以区分出代理的可用情况，选用的时候会更有针对性。</li>
                    <li>接口模块需要用 API 来提供对外服务的接口，其实我们可以直接连数据库来取，但是这样就需要知道数据库的连接信息，不太安全，而且需要配置连接，所以一个比较安全和方便的方式就是提供一个 Web API 接口，通过访问接口即可拿到可用代理。另外由于可用代理可能有多个，我们可以提供随机返回一个可用代理的接口，这样保证每个可用代理都可以取到，实现负载均衡。</li>
                  </ul>
                  <p>以上便是设计代理池的一些基本思路，那么接下来我们就设计一下整体的架构，然后用代码来实现代理池。</p>
                  <h3 id="3-代理池的架构"><a href="#3-代理池的架构" class="headerlink" title="3. 代理池的架构"></a>3. 代理池的架构</h3>
                  <p>根据上文的描述，代理池的架构可以是这样的，如图 9-1 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060342.jpg" alt=""> 图 9-1 代理池架构 代理池分为四个部分，获取模块、存储模块、检测模块、接口模块。</p>
                  <ul>
                    <li>存储模块使用Redis的有序集合，用以代理的去重和状态标识，同时它也是中心模块和基础模块，将其他模块串联起来。</li>
                    <li>获取模块定时从代理网站获取代理，将获取的代理传递给存储模块，保存到数据库。</li>
                    <li>检测模块定时通过存储模块获取所有代理，并对其进行检测，根据不同的检测结果对代理设置不同的标识。</li>
                    <li>接口模块通过 Web API 提供服务接口，其内部还是连接存储模块，获取可用的代理。</li>
                  </ul>
                  <h3 id="4-代理池的实现"><a href="#4-代理池的实现" class="headerlink" title="4. 代理池的实现"></a>4. 代理池的实现</h3>
                  <p>接下来我们分别用代码来实现一下这四个模块。</p>
                  <h4 id="存储模块"><a href="#存储模块" class="headerlink" title="存储模块"></a>存储模块</h4>
                  <p>存储在这里我们使用 Redis 的有序集合，集合的每一个元素都是不重复的，对于代理池来说，集合的元素就变成了一个个代理，也就是 IP 加端口的形式，如 60.207.237.111:8888，这样的一个代理就是集合的一个元素。另外有序集合的每一个元素还都有一个分数字段，分数是可以重复的，是一个浮点数类型，也可以是整数类型。该集合会根据每一个元素的分数对集合进行排序，数值小的排在前面，数值大的排在后面，这样就可以实现集合元素的排序了。 对于代理池来说，这个分数可以作为我们判断一个代理可用不可用的标志，我们将 100 设为最高分，代表可用，0 设为最低分，代表不可用。从代理池中获取代理的时候会随机获取分数最高的代理，注意这里是随机，这样可以保证每个可用代理都会被调用到。 分数是我们判断代理稳定性的重要标准，在这里我们设置分数规则如下：</p>
                  <ul>
                    <li>分数 100 为可用，检测器会定时循环检测每个代理可用情况，一旦检测到有可用的代理就立即置为 100，检测到不可用就将分数减 1，减至 0 后移除。</li>
                    <li>新获取的代理添加时将分数置为 10，当测试可行立即置 100，不可行分数减 1，减至 0 后移除。</li>
                  </ul>
                  <p>这是一种解决方案，当然可能还有更合理的方案。此方案的设置有一定的原因，在此总结如下：</p>
                  <ul>
                    <li>当检测到代理可用时立即置为 100，这样可以保证所有可用代理有更大的机会被获取到。你可能会说为什么不直接将分数加 1 而是直接设为最高 100 呢？设想一下，我们有的代理是从各大免费公开代理网站获取的，如果一个代理并没有那么稳定，平均五次请求有两次成功，三次失败，如果按照这种方式来设置分数，那么这个代理几乎不可能达到一个高的分数，也就是说它有时是可用的，但是我们筛选是筛选的分数最高的，所以这样的代理就几乎不可能被取到，当然如果想追求代理稳定性的话可以用这种方法，这样可确保分数最高的一定是最稳定可用的。但是在这里我们采取可用即设置 100 的方法，确保只要可用的代理都可以被使用到。</li>
                    <li>当检测到代理不可用时，将分数减 1，减至 0 后移除，一共 100 次机会，也就是说当一个可用代理接下来如果尝试了 100 次都失败了，就一直减分直到移除，一旦成功就重新置回 100，尝试机会越多代表将这个代理拯救回来的机会越多，这样不容易将曾经的一个可用代理丢弃，因为代理不可用的原因可能是网络繁忙或者其他人用此代理请求太过频繁，所以在这里设置为 100 级。</li>
                    <li>新获取的代理分数设置为 10，检测如果不可用就减 1，减到 0 就移除，如果可用就置 100。由于我们很多代理是从免费网站获取的，所以新获取的代理无效的可能性是非常高的，可用的可能不足 10%，所以在这里我们将其设置为 10，检测的机会没有可用代理 100 次那么多，这也可以适当减少开销。</li>
                  </ul>
                  <p>以上便是代理分数的一个设置思路，不一定是最优思路，但个人实测实用性还是比较强的。 所以我们就需要定义一个类来操作数据库的有序集合，定义一些方法来实现分数的设置，代理的获取等等。 实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br><span class="line">88</span><br><span class="line">89</span><br><span class="line">90</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">MAX_SCORE = <span class="number">100</span></span><br><span class="line">MIN_SCORE = <span class="number">0</span></span><br><span class="line">INITIAL_SCORE = <span class="number">10</span></span><br><span class="line">REDIS_HOST = <span class="string">'localhost'</span></span><br><span class="line">REDIS_PORT = <span class="number">6379</span></span><br><span class="line">REDIS_PASSWORD = <span class="literal">None</span></span><br><span class="line">REDIS_KEY = <span class="string">'proxies'</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> redis</span><br><span class="line"><span class="keyword">from</span> random <span class="keyword">import</span> choice</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">RedisClient</span><span class="params">(object)</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        初始化</span></span><br><span class="line"><span class="string">        :param host: Redis 地址</span></span><br><span class="line"><span class="string">        :param port: Redis 端口</span></span><br><span class="line"><span class="string">        :param password: Redis密码</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=<span class="literal">True</span>)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">add</span><span class="params">(self, 
proxy, score=INITIAL_SCORE)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        添加代理，设置分数为最高</span></span><br><span class="line"><span class="string">        :param proxy: 代理</span></span><br><span class="line"><span class="string">        :param score: 分数</span></span><br><span class="line"><span class="string">        :return: 添加结果</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">not</span> self.db.zscore(REDIS_KEY, proxy):</span><br><span class="line">            <span class="keyword">return</span> self.db.zadd(REDIS_KEY, score, proxy)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">random</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        随机获取有效代理，首先尝试获取最高分数代理，如果不存在，按照排名获取，否则异常</span></span><br><span class="line"><span class="string">        :return: 随机代理</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)</span><br><span class="line">        <span class="keyword">if</span> len(result):</span><br><span class="line">            <span class="keyword">return</span> choice(result)</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            result = self.db.zrevrange(REDIS_KEY, <span class="number">0</span>, <span class="number">100</span>)</span><br><span class="line">            <span class="keyword">if</span> len(result):</span><br><span class="line">                <span class="keyword">return</span> choice(result)</span><br><span class="line">            <span 
class="keyword">else</span>:</span><br><span class="line">                <span class="keyword">raise</span> PoolEmptyError</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">decrease</span><span class="params">(self, proxy)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        代理值减一分，小于最小值则删除</span></span><br><span class="line"><span class="string">        :param proxy: 代理</span></span><br><span class="line"><span class="string">        :return: 修改后的代理分数</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        score = self.db.zscore(REDIS_KEY, proxy)</span><br><span class="line">        <span class="keyword">if</span> score <span class="keyword">and</span> score &gt; MIN_SCORE:</span><br><span class="line">            print(<span class="string">'代理'</span>, proxy, <span class="string">'当前分数'</span>, score, <span class="string">'减1'</span>)</span><br><span class="line">            <span class="keyword">return</span> self.db.zincrby(REDIS_KEY, proxy, <span class="number">-1</span>)</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            print(<span class="string">'代理'</span>, proxy, <span class="string">'当前分数'</span>, score, <span class="string">'移除'</span>)</span><br><span class="line">            <span class="keyword">return</span> self.db.zrem(REDIS_KEY, proxy)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">exists</span><span class="params">(self, proxy)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        判断是否存在</span></span><br><span class="line"><span class="string">        :param proxy: 
代理</span></span><br><span class="line"><span class="string">        :return: 是否存在</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">return</span> <span class="keyword">not</span> self.db.zscore(REDIS_KEY, proxy) == <span class="literal">None</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">max</span><span class="params">(self, proxy)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        将代理设置为MAX_SCORE</span></span><br><span class="line"><span class="string">        :param proxy: 代理</span></span><br><span class="line"><span class="string">        :return: 设置结果</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        print(<span class="string">'代理'</span>, proxy, <span class="string">'可用，设置为'</span>, MAX_SCORE)</span><br><span class="line">        <span class="keyword">return</span> self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">count</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        获取数量</span></span><br><span class="line"><span class="string">        :return: 数量</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">return</span> self.db.zcard(REDIS_KEY)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">all</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span 
class="string">"""</span></span><br><span class="line"><span class="string">        获取全部代理</span></span><br><span class="line"><span class="string">        :return: 全部代理列表</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">return</span> self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>首先定义了一些常量，如 MAX_SCORE、MIN_SCORE、INITIAL_SCORE 分别代表最大分数、最小分数、初始分数。REDIS_HOST、REDIS_PORT、REDIS_PASSWORD 分别代表了 Redis 的连接信息，即地址、端口、密码。REDIS_KEY 是有序集合的键名，可以通过它来获取代理存储所使用的有序集合。 接下来定义了一个 RedisClient 类，用以操作 Redis 的有序集合，其中定义了一些方法来对集合中的元素进行处理，主要功能如下：</p>
                  <ul>
                    <li>__init__() 方法是初始化的方法，参数是 Redis 的连接信息，默认的连接信息已经定义为常量，在 __init__() 方法中初始化了一个 StrictRedis 的类，建立 Redis 连接。这样当 RedisClient 类初始化的时候就建立了 Redis 的连接。</li>
                    <li>add() 方法向数据库添加代理并设置分数，默认的分数是 INITIAL_SCORE 也就是 10，返回结果是添加的结果。</li>
                    <li>random() 方法是随机获取代理的方法，首先获取 100 分的代理，然后随机选择一个返回，如果不存在 100 分的代理，则按照排名来获取，选取前 100 名，然后随机选择一个返回，否则抛出异常。</li>
                    <li>decrease() 方法是在代理检测无效的时候设置分数减 1 的方法，传入代理，然后将此代理的分数减 1，如果达到最低值，那么就删除。</li>
                    <li>exists() 方法判断代理是否存在于集合中。</li>
                    <li>max() 方法是将代理的分数设置为 MAX_SCORE，即 100，也就是当代理有效时的设置。</li>
                    <li>count() 方法返回当前集合的元素个数。</li>
                    <li>all() 方法返回所有的代理列表，供检测使用。</li>
                  </ul>
                  <p>定义好了这些方法，我们可以在后续的模块中调用此类来连接和操作数据库，非常方便。如我们想要获取随机可用的代理，只需要调用 random() 方法即可，得到的就是随机的可用代理。</p>
                  <h4 id="获取模块"><a href="#获取模块" class="headerlink" title="获取模块"></a>获取模块</h4>
                  <p>获取模块的逻辑相对简单，首先需要定义一个 Crawler 来从各大网站抓取代理，示例如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import json</span><br><span class="line"><span class="keyword">from</span> .utils import get_page</span><br><span class="line"><span class="keyword">from</span> pyquery import PyQuery as pq</span><br><span class="line"></span><br><span class="line">class ProxyMetaclass(type):</span><br><span class="line">    def __new__(cls, name, bases, attrs):</span><br><span class="line">        count = 0</span><br><span class="line">        attrs[<span class="string">'__CrawlFunc__'</span>] = []</span><br><span class="line">        <span class="keyword">for</span> k, v <span class="keyword">in</span> attrs.items():</span><br><span class="line">            <span class="keyword">if</span> <span class="string">'crawl_'</span> <span class="keyword">in</span> k:</span><br><span class="line">                attrs[<span class="string">'__CrawlFunc__'</span>].append(k)</span><br><span class="line">                count += 1</span><br><span class="line">        attrs[<span class="string">'__CrawlFuncCount__'</span>] = count</span><br><span class="line">        return type.__new__(cls, name, bases, attrs)</span><br><span class="line"></span><br><span class="line">class Crawler(object, <span class="attribute">metaclass</span>=ProxyMetaclass):</span><br><span class="line">    def get_proxies(self, callback):</span><br><span class="line">        proxies = []</span><br><span class="line">        <span class="keyword">for</span><span class="built_in"> proxy </span><span class="keyword">in</span> eval(<span class="string">"self.&#123;&#125;()"</span>.format(callback)):</span><br><span class="line">            <span class="builtin-name">print</span>(<span class="string">'成功获取到代理'</span>, proxy)</span><br><span class="line">            proxies.append(proxy)</span><br><span class="line">        return proxies</span><br><span class="line"></span><br><span class="line">    def crawl_daili66(self, <span 
class="attribute">page_count</span>=4):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        获取代理66</span></span><br><span class="line"><span class="string">        :param page_count: 页码</span></span><br><span class="line"><span class="string">        :return: 代理</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        start_url = <span class="string">'http://www.66ip.cn/&#123;&#125;.html'</span></span><br><span class="line">        urls = [start_url.format(page) <span class="keyword">for</span><span class="built_in"> page </span><span class="keyword">in</span> range(1, page_count + 1)]</span><br><span class="line">        <span class="keyword">for</span> url <span class="keyword">in</span> urls:</span><br><span class="line">            <span class="builtin-name">print</span>(<span class="string">'Crawling'</span>, url)</span><br><span class="line">            html = get_page(url)</span><br><span class="line">            <span class="keyword">if</span> html:</span><br><span class="line">                doc = pq(html)</span><br><span class="line">                trs = doc(<span class="string">'.containerbox table tr:gt(0)'</span>).items()</span><br><span class="line">                <span class="keyword">for</span> tr <span class="keyword">in</span> trs:</span><br><span class="line">                   <span class="built_in"> ip </span>= tr.<span class="builtin-name">find</span>(<span class="string">'td:nth-child(1)'</span>).text()</span><br><span class="line">                   <span class="built_in"> port </span>= tr.<span class="builtin-name">find</span>(<span class="string">'td:nth-child(2)'</span>).text()</span><br><span class="line">                    yield <span class="string">':'</span>.join([ip, port])</span><br><span class="line"></span><br><span class="line">    def 
crawl_proxy360(self):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        获取Proxy360</span></span><br><span class="line"><span class="string">        :return: 代理</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        start_url = <span class="string">'http://www.proxy360.cn/Region/China'</span></span><br><span class="line">        <span class="builtin-name">print</span>(<span class="string">'Crawling'</span>, start_url)</span><br><span class="line">        html = get_page(start_url)</span><br><span class="line">        <span class="keyword">if</span> html:</span><br><span class="line">            doc = pq(html)</span><br><span class="line">            lines = doc(<span class="string">'div[name="list_proxy_ip"]'</span>).items()</span><br><span class="line">            <span class="keyword">for</span> line <span class="keyword">in</span> lines:</span><br><span class="line">               <span class="built_in"> ip </span>= line.<span class="builtin-name">find</span>(<span class="string">'.tbBottomLine:nth-child(1)'</span>).text()</span><br><span class="line">               <span class="built_in"> port </span>= line.<span class="builtin-name">find</span>(<span class="string">'.tbBottomLine:nth-child(2)'</span>).text()</span><br><span class="line">                yield <span class="string">':'</span>.join([ip, port])</span><br><span class="line"></span><br><span class="line">    def crawl_goubanjia(self):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        获取Goubanjia</span></span><br><span class="line"><span class="string">        :return: 代理</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        
start_url = <span class="string">'http://www.goubanjia.com/free/gngn/index.shtml'</span></span><br><span class="line">        html = get_page(start_url)</span><br><span class="line">        <span class="keyword">if</span> html:</span><br><span class="line">            doc = pq(html)</span><br><span class="line">            tds = doc(<span class="string">'td.ip'</span>).items()</span><br><span class="line">            <span class="keyword">for</span> td <span class="keyword">in</span> tds:</span><br><span class="line">                td.<span class="builtin-name">find</span>(<span class="string">'p'</span>).<span class="builtin-name">remove</span>()</span><br><span class="line">                yield td.text().replace(<span class="string">' '</span>, <span class="string">''</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>为了实现灵活，在这里我们将获取代理的一个个方法统一定义一个规范，如统一定义以 crawl 开头，这样扩展的时候只需要添加 crawl 开头的方法即可。 在这里实现了几个示例，如抓取代理 66、Proxy360、Goubanjia 三个免费代理网站，这些方法都定义成了生成器，通过 yield 返回一个个代理。首先将网页获取，然后用 PyQuery 解析，解析出 IP 加端口形式的代理然后返回。 然后定义了一个 get_proxies() 方法，将所有以 crawl 开头的方法调用一遍，获取每个方法返回的代理并组合成列表形式返回。 你可能会想知道是怎样获取了所有以 crawl 开头的方法名称的。其实这里借助于元类来实现，定义了一个 ProxyMetaclass，Crawler 类将它设置为元类，元类中实现了 __new__() 方法，这个方法有固定的几个参数，其中第四个参数 attrs 中包含了类的一些属性，这其中就包含了类中方法的一些信息，我们可以遍历 attrs 这个变量即可获取类的所有方法信息。所以在这里我们在 __new__() 方法中遍历了 attrs 的这个属性，就像遍历一个字典一样，键名对应的就是方法的名称，接下来判断其开头是否是 crawl，如果是，则将其加入到 __CrawlFunc__ 属性中，这样我们就成功将所有以 crawl 开头的方法定义成了一个属性，就成功动态地获取到所有以 crawl 开头的方法列表了。 所以说，如果要做扩展的话，我们只需要添加一个以 crawl 开头的方法，例如抓取快代理，我们只需要在 Crawler 类中增加 crawl_kuaidaili() 方法，仿照其他的几个方法将其定义成生成器，抓取其网站的代理，然后通过 yield 返回代理即可，所以这样我们可以非常方便地扩展，而不用关心类其他部分的实现逻辑。 代理网站的添加非常灵活，不仅可以添加免费代理，也可以添加付费代理，一些付费代理的提取方式其实也类似，也是通过 Web 的形式获取，然后进行解析，解析方式可能更加简单，如解析纯文本或 Json，解析之后以同样的方式返回即可，在此不再添加，可以自行扩展。 既然定义了这个 Crawler 类，我们就要调用啊，所以在这里再定义一个 Getter 类，动态地调用所有以 crawl 开头的方法，然后获取抓取到的代理，将其加入到数据库存储起来。</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> db <span class="keyword">import</span> RedisClient</span><br><span class="line"><span class="keyword">from</span> crawler <span class="keyword">import</span> Crawler</span><br><span class="line"></span><br><span class="line">POOL_UPPER_THRESHOLD = <span class="number">10000</span></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Getter</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self)</span>:</span></span><br><span class="line">        self.redis = RedisClient()</span><br><span class="line">        self.crawler = Crawler()</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">is_over_threshold</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        判断是否达到了代理池限制</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span class="keyword">if</span> self.redis.count() &gt;= POOL_UPPER_THRESHOLD:</span><br><span class="line">            <span class="keyword">return</span> <span class="literal">True</span></span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="keyword">return</span> <span class="literal">False</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">run</span><span class="params">(self)</span>:</span></span><br><span class="line">        print(<span class="string">'获取器开始执行'</span>)</span><br><span class="line">        <span 
class="keyword">if</span> <span class="keyword">not</span> self.is_over_threshold():</span><br><span class="line">            <span class="keyword">for</span> callback_label <span class="keyword">in</span> range(self.crawler.__CrawlFuncCount__):</span><br><span class="line">                callback = self.crawler.__CrawlFunc__[callback_label]</span><br><span class="line">                proxies = self.crawler.get_proxies(callback)</span><br><span class="line">                <span class="keyword">for</span> proxy <span class="keyword">in</span> proxies:</span><br><span class="line">                    self.redis.add(proxy)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>Getter 类就是获取器类，这其中定义了一个变量 POOL_UPPER_THRESHOLD 表示代理池的最大数量，这个数量可以灵活配置，然后定义了 is_over_threshold() 方法判断代理池是否已经达到了容量阈值，它就是调用了 RedisClient 的 count() 方法获取代理的数量，然后加以判断，如果数量达到阈值则返回 True，否则 False。如果不想加这个限制可以将此方法永久返回 False。 接下来定义了 run() 方法，首先判断了代理池是否达到阈值，然后在这里就调用了 Crawler 类的 __CrawlFunc__ 属性，获取到所有以 crawl 开头的方法列表，依次通过 get_proxies() 方法调用，得到各个方法抓取到的代理，然后再利用 RedisClient 的 add() 方法加入数据库，这样获取模块的工作就完成了。</p>
                  <h4 id="检测模块"><a href="#检测模块" class="headerlink" title="检测模块"></a>检测模块</h4>
                  <p>在获取模块中，我们已经成功将各个网站的代理获取下来了，然后就需要一个检测模块来对所有的代理进行一轮轮的检测，检测可用就设置为 100，不可用就分数减 1，这样就可以实时改变每个代理的可用情况，在获取有效代理的时候只需要获取分数高的代理即可。 由于代理的数量非常多，为了提高代理的检测效率，我们在这里使用异步请求库 Aiohttp 来进行检测。 Requests 作为一个同步请求库，我们在发出一个请求之后需要等待网页加载完成之后才能继续执行程序。也就是这个过程会阻塞在等待响应这个过程，如果服务器响应非常慢，比如一个请求等待十几秒，那么我们使用 Requests 完成一个请求就会需要十几秒的时间，中间其实就是一个等待响应的过程，程序也不会继续往下执行，而这十几秒的时间其实完全可以去做其他的事情，比如调度其他的请求或者进行网页解析等等。 异步请求库就解决了这个问题，它类似 JavaScript 中的回调，意思是说在请求发出之后，程序可以继续接下去执行去做其他的事情，当响应到达时，会通知程序再去处理这个响应，这样程序就没有被阻塞，充分把时间和资源利用起来，大大提高效率。 对于响应速度比较快的网站，可能 Requests 同步请求和 Aiohttp 异步请求的效果差距没那么大，可对于检测代理这种事情，一般是需要十多秒甚至几十秒的时间，这时候使用 Aiohttp 异步请求库的优势就大大体现出来了，效率可能会提高几十倍不止。 所以在这里我们的代理检测使用异步请求库 Aiohttp，实现示例如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">VALID_STATUS_CODES = [200]</span><br><span class="line">TEST_URL = <span class="string">'http://www.baidu.com'</span></span><br><span class="line">BATCH_TEST_SIZE = 100</span><br><span class="line"></span><br><span class="line">class Tester(object):</span><br><span class="line">    def __init__(self):</span><br><span class="line">        self.redis = RedisClient()</span><br><span class="line"></span><br><span class="line">    async def test_single_proxy(self, proxy):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        测试单个代理</span></span><br><span class="line"><span class="string">        :param proxy: 单个代理</span></span><br><span class="line"><span class="string">        :return: None</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        conn = aiohttp.TCPConnector(<span class="attribute">verify_ssl</span>=<span class="literal">False</span>)</span><br><span class="line">        async with aiohttp.ClientSession(<span class="attribute">connector</span>=conn) as session:</span><br><span class="line">            try:</span><br><span class="line">                <span class="keyword">if</span> isinstance(proxy, bytes):</span><br><span class="line">                   <span class="built_in"> proxy </span>= proxy.decode(<span class="string">'utf-8'</span>)</span><br><span class="line">                real_proxy = <span class="string">'http://'</span> + proxy</span><br><span class="line">                <span class="builtin-name">print</span>(<span class="string">'正在测试'</span>, proxy)</span><br><span class="line">                async with session.<span class="builtin-name">get</span>(TEST_URL, <span class="attribute">proxy</span>=real_proxy, <span class="attribute">timeout</span>=15) as response:</span><br><span class="line">             
       <span class="keyword">if</span> response.status <span class="keyword">in</span> VALID_STATUS_CODES:</span><br><span class="line">                        self.redis.max(proxy)</span><br><span class="line">                        <span class="builtin-name">print</span>(<span class="string">'代理可用'</span>, proxy)</span><br><span class="line">                    <span class="keyword">else</span>:</span><br><span class="line">                        self.redis.decrease(proxy)</span><br><span class="line">                        <span class="builtin-name">print</span>(<span class="string">'请求响应码不合法'</span>, proxy)</span><br><span class="line">            except (ClientError, ClientConnectorError, TimeoutError, AttributeError):</span><br><span class="line">                self.redis.decrease(proxy)</span><br><span class="line">                <span class="builtin-name">print</span>(<span class="string">'代理请求失败'</span>, proxy)</span><br><span class="line"></span><br><span class="line">    def <span class="builtin-name">run</span>(self):</span><br><span class="line">        <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">        测试主函数</span></span><br><span class="line"><span class="string">        :return: None</span></span><br><span class="line"><span class="string">        "</span><span class="string">""</span></span><br><span class="line">        <span class="builtin-name">print</span>(<span class="string">'测试器开始运行'</span>)</span><br><span class="line">        try:</span><br><span class="line">            proxies = self.redis.all()</span><br><span class="line">            loop = asyncio.get_event_loop()</span><br><span class="line">            # 批量测试</span><br><span class="line">            <span class="keyword">for</span> i <span class="keyword">in</span> range(0, len(proxies), BATCH_TEST_SIZE):</span><br><span class="line">                test_proxies = proxies[i:i + BATCH_TEST_SIZE]</span><br><span 
class="line">                tasks = [self.test_single_proxy(proxy) <span class="keyword">for</span><span class="built_in"> proxy </span><span class="keyword">in</span> test_proxies]</span><br><span class="line">                loop.run_until_complete(asyncio.wait(tasks))</span><br><span class="line">                time.sleep(5)</span><br><span class="line">        except Exception as e:</span><br><span class="line">            <span class="builtin-name">print</span>(<span class="string">'测试器发生错误'</span>, e.args)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里定义了一个类 Tester，__init__() 方法中建立了一个 RedisClient 对象，供类中其他方法使用。接下来定义了一个 test_single_proxy() 方法，用来检测单个代理的可用情况，其参数就是被检测的代理，注意这个方法前面加了 async 关键词，代表这个方法是异步的，方法内部首先创建了 Aiohttp 的 ClientSession 对象，此对象类似于 Requests 的 Session 对象，可以直接调用该对象的 get() 方法来访问页面，在这里代理的设置方式是通过 proxy 参数传递给 get() 方法，请求方法前面也需要加上 async 关键词标明是异步请求，这也是 Aiohttp 使用时的常见写法。 测试的链接在这里定义常量为 TEST_URL，如果针对某个网站有抓取需求，建议将 TEST_URL 设置为目标网站的地址，因为在抓取的过程中，可能代理本身是可用的，但是该代理的 IP 已经被目标网站封掉了。例如，如要抓取知乎，可能其中某些代理是可以正常使用，比如访问百度等页面是完全没有问题的，但是可能对知乎来说可能就被封了，所以可以将 TEST_URL 设置为知乎的某个页面的链接，当请求失败或代理被封时，分数自然会减下来，就不会被取到了。 如果想做一个通用的代理池，则不需要专门设置 TEST_URL，可以设置为一个不会封 IP 的网站，也可以设置为百度这类响应稳定的网站。 另外我们还定义了 VALID_STATUS_CODES 变量，是一个列表形式，包含了正常的状态码，如可以定义成 [200]，当然对于某些检测目标网站可能会出现其他的状态码也是正常的，可以自行配置。 获取 Response 后需要判断响应的状态，如果状态码在 VALID_STATUS_CODES 这个列表里，则代表代理可用，调用 RedisClient 的 max() 方法将代理分数设为 100，否则调用 decrease() 方法将代理分数减 1，如果出现异常也同样将代理分数减 1。 另外在测试的时候设置了批量测试的最大值 BATCH_TEST_SIZE 为 100，也就是一批测试最多测试 100 个，这可以避免当代理池过大时全部测试导致内存开销过大的问题。 随后在 run() 方法里面获取了所有的代理列表，使用 Aiohttp 分配任务，启动运行，这样就可以进行异步检测了，写法可以参考 Aiohttp 的官方示例：<a href="http://aiohttp.readthedocs.io/" target="_blank" rel="noopener">http://aiohttp.readthedocs.io/</a>。 这样测试模块的逻辑就完成了。</p>
                  <h4 id="接口模块"><a href="#接口模块" class="headerlink" title="接口模块"></a>接口模块</h4>
                  <p>通过上述三个模块我们已经可以做到代理的获取、检测和更新了，数据库中就会以有序集合的形式存储各个代理还有对应的分数，分数 100 代表可用，分数越小代表越不可用。 但是我们怎样来方便地获取可用代理呢？用 RedisClient 类来直接连接 Redis 然后调用 random() 方法获取当然没问题，这样做效率很高，但是有这么几个弊端：</p>
                  <ul>
                    <li>需要知道 Redis 的用户名和密码，如果这个代理池是给其他人使用的就需要告诉他连接的用户名和密码信息，这样是很不安全的。</li>
                    <li>代理池如果想持续运行需要部署在远程服务器上运行，如果远程服务器的 Redis 是只允许本地连接的，那么就没有办法远程直连 Redis 获取代理了。</li>
                    <li>如果爬虫所在的主机没有连接 Redis 的模块，或者爬虫不是由 Python 语言编写的，那么就无法使用 RedisClient 来获取代理了。</li>
                    <li>如果 RedisClient 类或者数据库结构有更新，那么在爬虫端还需要去同步这些更新。</li>
                  </ul>
                  <p>综上考虑，为了使得代理池可以作为一个独立服务运行，我们最好增加一个接口模块，以 Web API 的形式暴露可用代理。 这样获取代理只需要请求一下接口即可，以上的几个弊端也就都可以解决了。 我们在这里使用一个比较轻量级的库 Flask 来实现这个接口模块，实现示例如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> flask <span class="keyword">import</span> Flask, g</span><br><span class="line"><span class="keyword">from</span> db <span class="keyword">import</span> RedisClient</span><br><span class="line"></span><br><span class="line">__all__ = [<span class="string">'app'</span>]</span><br><span class="line">app = Flask(__name__)</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_conn</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="keyword">if</span> <span class="keyword">not</span> hasattr(g, <span class="string">'redis'</span>):</span><br><span class="line">        g.redis = RedisClient()</span><br><span class="line">    <span class="keyword">return</span> g.redis</span><br><span class="line"></span><br><span class="line"><span class="meta">@app.route('/')</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">index</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="keyword">return</span> <span class="string">'&lt;h2&gt;Welcome to Proxy Pool System&lt;/h2&gt;'</span></span><br><span class="line"></span><br><span class="line"><span class="meta">@app.route('/random')</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_proxy</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取随机可用代理</span></span><br><span class="line"><span class="string">    :return: 随机代理</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    conn = get_conn()</span><br><span class="line">    <span class="keyword">return</span> conn.random()</span><br><span 
class="line"></span><br><span class="line"><span class="meta">@app.route('/count')</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_counts</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取代理池总量</span></span><br><span class="line"><span class="string">    :return: 代理池总量</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    conn = get_conn()</span><br><span class="line">    <span class="keyword">return</span> str(conn.count())</span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    app.run()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们声明了一个 Flask 对象，定义了三个接口，分别是首页、随机代理页、获取数量页。 运行之后 Flask 会启动一个 Web 服务，我们只需要访问对应的接口即可获取到可用代理。</p>
                  <h4 id="调度模块"><a href="#调度模块" class="headerlink" title="调度模块"></a>调度模块</h4>
                  <p>这个模块其实就是调用以上所定义的三个模块，将以上三个模块通过多进程的形式运行起来，示例如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">TESTER_CYCLE = <span class="number">20</span></span><br><span class="line">GETTER_CYCLE = <span class="number">20</span></span><br><span class="line">TESTER_ENABLED = <span class="literal">True</span></span><br><span class="line">GETTER_ENABLED = <span class="literal">True</span></span><br><span class="line">API_ENABLED = <span class="literal">True</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">from</span> multiprocessing <span class="keyword">import</span> Process</span><br><span class="line"><span class="keyword">from</span> api <span class="keyword">import</span> app</span><br><span class="line"><span class="keyword">from</span> getter <span class="keyword">import</span> Getter</span><br><span class="line"><span class="keyword">from</span> tester <span class="keyword">import</span> Tester</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Scheduler</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">schedule_tester</span><span class="params">(self, cycle=TESTER_CYCLE)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        定时测试代理</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        tester = Tester()</span><br><span class="line">        <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">            print(<span class="string">'测试器开始运行'</span>)</span><br><span class="line">            tester.run()</span><br><span class="line">            time.sleep(cycle)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span 
class="title">schedule_getter</span><span class="params">(self, cycle=GETTER_CYCLE)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        定时获取代理</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        getter = Getter()</span><br><span class="line">        <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">            print(<span class="string">'开始抓取代理'</span>)</span><br><span class="line">            getter.run()</span><br><span class="line">            time.sleep(cycle)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">schedule_api</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        开启API</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        app.run(API_HOST, API_PORT)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">run</span><span class="params">(self)</span>:</span></span><br><span class="line">        print(<span class="string">'代理池开始运行'</span>)</span><br><span class="line">        <span class="keyword">if</span> TESTER_ENABLED:</span><br><span class="line">            tester_process = Process(target=self.schedule_tester)</span><br><span class="line">            tester_process.start()</span><br><span class="line"></span><br><span class="line">        <span class="keyword">if</span> GETTER_ENABLED:</span><br><span class="line">            getter_process = Process(target=self.schedule_getter)</span><br><span class="line">            getter_process.start()</span><br><span class="line"></span><br><span class="line">     
   <span class="keyword">if</span> API_ENABLED:</span><br><span class="line">            api_process = Process(target=self.schedule_api)</span><br><span class="line">            api_process.start()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里还有三个常量，TESTER_ENABLED、GETTER_ENABLED、API_ENABLED 都是布尔类型，True 或者 False。标明了测试模块、获取模块、接口模块的开关，如果为 True，则代表模块开启。 启动入口是 run() 方法，其分别判断了三个模块的开关，如果开启的话，就新建一个 Process 进程，设置好启动目标，然后调用 start() 方法运行，这样三个进程就可以并行执行，互不干扰。 三个调度方法结构也非常清晰，比如 schedule_tester() 方法，这是用来调度测试模块的方法，首先声明一个 Tester 对象，然后进入死循环不断循环调用其 run() 方法，执行完一轮之后就休眠一段时间，休眠结束之后重新再执行。在这里休眠时间也定义为一个常量，如 20 秒，这样就会每隔 20 秒进行一次代理检测。 最后整个代理池的运行只需要调用 Scheduler 的 run() 方法即可启动。 以上便是整个代理池的架构和相应实现逻辑。</p>
                  <h3 id="5-运行"><a href="#5-运行" class="headerlink" title="5. 运行"></a>5. 运行</h3>
                  <p>接下来我们将代码整合一下，将代理池运行起来，运行之后的输出结果如图 9-2 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060356.png" alt="代理池运行时的控制台输出"> 图 9-2 运行结果 以上是代理池的控制台输出，可以看到可用代理设置为 100，不可用代理分数减 1。 接下来我们再打开浏览器，当前配置了运行在 5555 端口，所以打开：<a href="http://127.0.0.1:5555" target="_blank" rel="noopener">http://127.0.0.1:5555</a>，即可看到其首页，如图 9-3 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060403.jpg" alt="浏览器中显示的代理池接口首页"> 图 9-3 首页页面 再访问：<a href="http://127.0.0.1:5555/random" target="_blank" rel="noopener">http://127.0.0.1:5555/random</a>，即可获取随机可用代理，如图 9-4 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-060410.jpg" alt="浏览器中 /random 接口返回的随机代理"> 图 9-4 获取代理页面 所以后面我们只需要访问此接口即可获取一个随机可用代理，非常方便。 获取代理的代码如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"></span><br><span class="line">PROXY_POOL_URL = <span class="string">'http://localhost:5555/random'</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_proxy</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="keyword">try</span>:</span><br><span class="line">        response = requests.get(PROXY_POOL_URL)</span><br><span class="line">        <span class="keyword">if</span> response.status_code == <span class="number">200</span>:</span><br><span class="line">            <span class="keyword">return</span> response.text</span><br><span class="line">    <span class="keyword">except</span> ConnectionError:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">None</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>获取下来之后便是一个字符串类型的代理，可以按照上一节所示的方法设置代理，如 Requests 的使用方法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line"></span><br><span class="line">proxy = get_proxy()</span><br><span class="line">proxies = &#123;</span><br><span class="line">    <span class="string">'http'</span>: <span class="string">'http://'</span> + proxy,</span><br><span class="line">    <span class="string">'https'</span>: <span class="string">'https://'</span> + proxy,</span><br><span class="line">&#125;</span><br><span class="line">try:</span><br><span class="line">    response = requests.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>, <span class="attribute">proxies</span>=proxies)</span><br><span class="line">    <span class="builtin-name">print</span>(response.text)</span><br><span class="line">except requests.exceptions.ConnectionError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(<span class="string">'Error'</span>, e.args)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>有了代理池之后，我们再取出代理即可有效防止IP被封禁的情况。</p>
                  <h3 id="6-本节代码"><a href="#6-本节代码" class="headerlink" title="6. 本节代码"></a>6. 本节代码</h3>
                  <p>本节代码地址为：<a href="https://github.com/Python3WebSpider/ProxyPool" target="_blank" rel="noopener">https://github.com/Python3WebSpider/ProxyPool</a>。</p>
                  <h3 id="7-结语"><a href="#7-结语" class="headerlink" title="7. 结语"></a>7. 结语</h3>
                  <p>本节我们实现了一个比较高效的代理池来获取随机可用的代理，整个内容比较多，需要好好理解一下。 在后文我们会利用代理池来实现数据的抓取。</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 14:05:09" itemprop="dateCreated datePublished" datetime="2019-08-02T14:05:09+08:00">2019-08-02</time>
                </span>
                <span id="/7048.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 9.2-代理池的维护" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>15k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>13 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7045.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7045.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 9.1-代理的设置</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>
                  <p>在前面我们介绍了多种请求库，如 Requests、Urllib、Selenium 等。我们接下来首先贴近实战，了解一下代理怎么使用，为后面了解代理池、ADSL 拨号代理的使用打下基础。 下面我们来梳理一下这些库的代理的设置方法。</p>
                  <h3 id="1-获取代理"><a href="#1-获取代理" class="headerlink" title="1. 获取代理"></a>1. 获取代理</h3>
                  <p>在做测试之前，我们需要先获取一个可用代理，在搜索引擎中搜索“代理”关键字，就可以看到有许多代理服务网站，在网站上会有很多免费代理，比如西刺：<a href="http://www.xicidaili.com/" target="_blank" rel="noopener">http://www.xicidaili.com/</a>，这里列出了很多免费代理，但是这些免费代理大多数情况下都是不好用的，所以比较靠谱的方法是购买付费代理，很多网站都有售卖，数量不用多，买一个稳定可用的即可，可以自行选购。 或者如果我们本机有相关代理软件的话，软件一般会在本机创建 HTTP 或 SOCKS 代理服务，直接使用此代理也可以。 在这里我的本机安装了一款代理软件，它会在本地 9743 端口上创建 HTTP 代理服务，也就是代理为 127.0.0.1:9743，另外还会在 9742 端口创建 SOCKS 代理服务，也就是代理为 127.0.0.1:9742，我只要设置了这个代理就可以成功将本机 IP 切换到代理软件连接的服务器的 IP 了。 所以本节下面的示例里我使用上述代理来演示其设置方法，你可以自行替换成自己的可用代理，设置代理后测试的网址是：<a href="http://httpbin.org/get" target="_blank" rel="noopener">http://httpbin.org/get</a>，访问该站点可以得到请求的一些相关信息，其中 origin 字段就是客户端的 IP，我们可以根据它来判断代理是否设置成功，也就是是否成功伪装了 IP。 下面我们来看下各个库的代理设置方式。</p>
                  <h3 id="2-Urllib"><a href="#2-Urllib" class="headerlink" title="2. Urllib"></a>2. Urllib</h3>
                  <p>首先我们以最基础的 Urllib 为例，来看一下代理的设置方法，代码如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> urllib.<span class="builtin-name">error</span> import URLError</span><br><span class="line"><span class="keyword">from</span> urllib.request import ProxyHandler, build_opener</span><br><span class="line"></span><br><span class="line">proxy = <span class="string">'127.0.0.1:9743'</span></span><br><span class="line">proxy_handler = ProxyHandler(&#123;</span><br><span class="line">    <span class="string">'http'</span>: <span class="string">'http://'</span> + proxy,</span><br><span class="line">    <span class="string">'https'</span>: <span class="string">'https://'</span> + proxy</span><br><span class="line">&#125;)</span><br><span class="line">opener = build_opener(proxy_handler)</span><br><span class="line">try:</span><br><span class="line">    response = opener.open(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line">    <span class="builtin-name">print</span>(response.read().decode(<span class="string">'utf-8'</span>))</span><br><span class="line">except URLError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(e.reason)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果如下：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"identity"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"Python-urllib/3.6"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们需要借助于 ProxyHandler 设置代理，参数是字典类型，键名为协议类型，键值是代理，注意此处代理前面需要加上协议，即 http 或者 https，此处设置了 http 和 https 两种代理，当我们请求的链接是 http 协议的时候，它会调用 http 代理，当请求的链接是 https 协议的时候，它会调用 https 代理。这里请求的链接是 http 协议，所以此处生效的代理是：<a href="http://127.0.0.1:9743" target="_blank" rel="noopener">http://127.0.0.1:9743</a>。 创建完 ProxyHandler 对象之后，我们需要利用 build_opener() 方法传入该对象来创建一个 Opener，这样就相当于此 Opener 已经设置好代理了，接下来直接调用它的 open() 方法即可使用此代理访问我们所想要的链接。 运行输出结果是一个 JSON，它有一个字段 origin，标明了客户端的 IP，此处的 IP 验证一下，确实为代理的 IP，而并不是我们真实的 IP，所以这样我们就成功设置好代理，并可以隐藏真实 IP 了。 如果遇到需要认证的代理，我们可以用如下的方法设置：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> urllib.<span class="builtin-name">error</span> import URLError</span><br><span class="line"><span class="keyword">from</span> urllib.request import ProxyHandler, build_opener</span><br><span class="line"></span><br><span class="line">proxy = <span class="string">'username:password@127.0.0.1:9743'</span></span><br><span class="line">proxy_handler = ProxyHandler(&#123;</span><br><span class="line">    <span class="string">'http'</span>: <span class="string">'http://'</span> + proxy,</span><br><span class="line">    <span class="string">'https'</span>: <span class="string">'https://'</span> + proxy</span><br><span class="line">&#125;)</span><br><span class="line">opener = build_opener(proxy_handler)</span><br><span class="line">try:</span><br><span class="line">    response = opener.open(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line">    <span class="builtin-name">print</span>(response.read().decode(<span class="string">'utf-8'</span>))</span><br><span class="line">except URLError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(e.reason)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里改变的只是 proxy 变量，只需要在代理前面加入代理认证的用户名密码即可，其中 username 就是用户名，password 为密码，例如 username 为 foo，密码为 bar，那么代理就是 foo:bar@127.0.0.1:9743。 如果代理是 SOCKS5 类型，那么可以用如下方式设置代理：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import socks</span><br><span class="line">import socket</span><br><span class="line"><span class="keyword">from</span> urllib import request</span><br><span class="line"><span class="keyword">from</span> urllib.<span class="builtin-name">error</span> import URLError</span><br><span class="line"></span><br><span class="line">socks.set_default_proxy(socks.SOCKS5, <span class="string">'127.0.0.1'</span>, 9742)</span><br><span class="line">socket.socket = socks.socksocket</span><br><span class="line">try:</span><br><span class="line">    response = request.urlopen(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line">    <span class="builtin-name">print</span>(response.read().decode(<span class="string">'utf-8'</span>))</span><br><span class="line">except URLError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(e.reason)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>此处需要一个 Socks 模块，可以通过如下命令安装：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">pip3 <span class="keyword">install</span> PySocks</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>本地我有一个 SOCKS5 代理，运行在 9742 端口，运行成功之后和上文 HTTP 代理输出结果是一样的：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"identity"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"Python-urllib/3.6"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>结果的 origin 字段同样为代理的 IP，设置代理成功。</p>
                  <h3 id="3-Requests"><a href="#3-Requests" class="headerlink" title="3. Requests"></a>3. Requests</h3>
                  <p>对于 Requests 来说，代理设置更加简单，我们只需要传入 proxies 参数即可。 还是以上例中的代理为例，我们来看下 Requests 的代理的设置：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line"></span><br><span class="line">proxy = <span class="string">'127.0.0.1:9743'</span></span><br><span class="line">proxies = &#123;</span><br><span class="line">    <span class="string">'http'</span>: <span class="string">'http://'</span> + proxy,</span><br><span class="line">    <span class="string">'https'</span>: <span class="string">'https://'</span> + proxy,</span><br><span class="line">&#125;</span><br><span class="line">try:</span><br><span class="line">    response = requests.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>, <span class="attribute">proxies</span>=proxies)</span><br><span class="line">    <span class="builtin-name">print</span>(response.text)</span><br><span class="line">except requests.exceptions.ConnectionError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(<span class="string">'Error'</span>, e.args)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept"</span>: <span class="string">"*/*"</span>, </span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"gzip, deflate"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"python-requests/2.18.1"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以发现 Requests 的代理设置比 Urllib 简单很多，只需要构造代理字典即可，然后通过 proxies 参数即可设置代理，不需要重新构建 Opener。 可以发现其运行结果的 origin 也是代理的 IP，证明代理已经设置成功。 如果代理需要认证，同样在代理的前面加上用户名密码即可，代理的写法就变成：</p>
                  <figure class="highlight ini">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">proxy</span> = <span class="string">'username:password@127.0.0.1:9743'</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>和 Urllib 一样，只需要将 username 和 password 替换即可。 如果需要使用 SOCKS5 代理，则可以使用如下方式：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line"></span><br><span class="line">proxy = <span class="string">'127.0.0.1:9742'</span></span><br><span class="line">proxies = &#123;</span><br><span class="line">    <span class="string">'http'</span>: <span class="string">'socks5://'</span> + proxy,</span><br><span class="line">    <span class="string">'https'</span>: <span class="string">'socks5://'</span> + proxy</span><br><span class="line">&#125;</span><br><span class="line">try:</span><br><span class="line">    response = requests.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>, <span class="attribute">proxies</span>=proxies)</span><br><span class="line">    <span class="builtin-name">print</span>(response.text)</span><br><span class="line">except requests.exceptions.ConnectionError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(<span class="string">'Error'</span>, e.args)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里需要额外安装一个 Socks 模块，命令如下：</p>
                  <figure class="highlight cmake">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">pip3 <span class="keyword">install</span> <span class="string">"requests[socks]"</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果是完全相同的：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept"</span>: <span class="string">"*/*"</span>, </span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"gzip, deflate"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"python-requests/2.18.1"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>另外还有一种设置方式，和 Urllib 中的方法相同，使用 socks 模块，也需要像上文一样安装该库，设置方法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">import requests</span><br><span class="line">import socks</span><br><span class="line">import socket</span><br><span class="line"></span><br><span class="line">socks.set_default_proxy(socks.SOCKS5, <span class="string">'127.0.0.1'</span>, 9742)</span><br><span class="line">socket.socket = socks.socksocket</span><br><span class="line">try:</span><br><span class="line">    response = requests.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line">    <span class="builtin-name">print</span>(response.text)</span><br><span class="line">except requests.exceptions.ConnectionError as e:</span><br><span class="line">    <span class="builtin-name">print</span>(<span class="string">'Error'</span>, e.args)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样也可以设置 SOCKS5 代理，运行结果完全相同，相比第一种方法，此方法是全局设置，不同情况可以选用不同的方法。</p>
                  <h3 id="4-Selenium"><a href="#4-Selenium" class="headerlink" title="4. Selenium"></a>4. Selenium</h3>
                  <p>Selenium 同样也可以设置代理，在这里分两种情况介绍，一种是有界面浏览器，以 Chrome 为例介绍；另一种是无界面浏览器，以 PhantomJS 为例介绍。</p>
                  <h4 id="Chrome"><a href="#Chrome" class="headerlink" title="Chrome"></a>Chrome</h4>
                  <p>对于 Chrome 来说，用 Selenium 设置代理的方法也非常简单，设置方法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> selenium import webdriver</span><br><span class="line"></span><br><span class="line">proxy = <span class="string">'127.0.0.1:9743'</span></span><br><span class="line">chrome_options = webdriver.ChromeOptions()</span><br><span class="line">chrome_options.add_argument(<span class="string">'--proxy-server=http://'</span> + proxy)</span><br><span class="line">browser = webdriver.Chrome(<span class="attribute">chrome_options</span>=chrome_options)</span><br><span class="line">browser.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们通过 ChromeOptions 来设置代理，在创建 Chrome 对象的时候通过 chrome_options 参数传递即可。 这样在运行之后便会弹出一个 Chrome 浏览器，访问目标链接之后输出结果如下：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept"</span>: <span class="string">"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"</span>, </span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"gzip, deflate"</span>, </span><br><span class="line">    <span class="attr">"Accept-Language"</span>: <span class="string">"zh-CN,zh;q=0.8"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"Upgrade-Insecure-Requests"</span>: <span class="string">"1"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>可以看到 origin 同样为代理 IP 的地址，代理设置成功。 如果代理是认证代理，则设置方法相对比较麻烦，方法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> selenium import webdriver</span><br><span class="line"><span class="keyword">from</span> selenium.webdriver.chrome.options import Options</span><br><span class="line">import zipfile</span><br><span class="line"></span><br><span class="line">ip = <span class="string">'127.0.0.1'</span></span><br><span class="line">port = 9743</span><br><span class="line">username = <span class="string">'foo'</span></span><br><span class="line">password = <span class="string">'bar'</span></span><br><span class="line"></span><br><span class="line">manifest_json = <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">&#123;</span></span><br><span class="line"><span class="string">    "</span>version<span class="string">": "</span>1.0.0<span class="string">",</span></span><br><span class="line"><span class="string">    "</span>manifest_version<span class="string">": 2,</span></span><br><span class="line"><span class="string">    "</span>name<span class="string">": "</span>Chrome Proxy<span class="string">",</span></span><br><span class="line"><span class="string">    "</span>permissions<span class="string">": [</span></span><br><span class="line"><span class="string">        "</span>proxy<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>tabs<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>unlimitedStorage<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>storage<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>&lt;all_urls&gt;<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>webRequest<span class="string">",</span></span><br><span class="line"><span class="string">        "</span>webRequestBlocking<span 
class="string">"</span></span><br><span class="line"><span class="string">    ],</span></span><br><span class="line"><span class="string">    "</span>background<span class="string">": &#123;</span></span><br><span class="line"><span class="string">        "</span>scripts<span class="string">": ["</span>background.js<span class="string">"]</span></span><br><span class="line"><span class="string">    &#125;</span></span><br><span class="line"><span class="string">&#125;</span></span><br><span class="line"><span class="string">"</span><span class="string">""</span></span><br><span class="line"></span><br><span class="line">background_js = <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">var config = &#123;</span></span><br><span class="line"><span class="string">        mode: "</span>fixed_servers<span class="string">",</span></span><br><span class="line"><span class="string">        rules: &#123;</span></span><br><span class="line"><span class="string">          singleProxy: &#123;</span></span><br><span class="line"><span class="string">            scheme: "</span>http<span class="string">",</span></span><br><span class="line"><span class="string">            host: "</span>%(ip)s<span class="string">",</span></span><br><span class="line"><span class="string">            port: %(port)s</span></span><br><span class="line"><span class="string">          &#125;</span></span><br><span class="line"><span class="string">        &#125;</span></span><br><span class="line"><span class="string">      &#125;</span></span><br><span class="line"><span class="string"></span></span><br><span class="line"><span class="string">chrome.proxy.settings.set(&#123;value: config, scope: "</span>regular<span class="string">"&#125;, function() &#123;&#125;);</span></span><br><span class="line"><span class="string"></span></span><br><span class="line"><span class="string">function callbackFn(details) &#123;</span></span><br><span 
class="line"><span class="string">    return &#123;</span></span><br><span class="line"><span class="string">        authCredentials: &#123;</span></span><br><span class="line"><span class="string">            username: "</span>%(username)s<span class="string">",</span></span><br><span class="line"><span class="string">            password: "</span>%(password)s<span class="string">"</span></span><br><span class="line"><span class="string">        &#125;</span></span><br><span class="line"><span class="string">    &#125;</span></span><br><span class="line"><span class="string">&#125;</span></span><br><span class="line"><span class="string"></span></span><br><span class="line"><span class="string">chrome.webRequest.onAuthRequired.addListener(</span></span><br><span class="line"><span class="string">            callbackFn,</span></span><br><span class="line"><span class="string">            &#123;urls: ["</span>&lt;all_urls&gt;<span class="string">"]&#125;,</span></span><br><span class="line"><span class="string">            ['blocking']</span></span><br><span class="line"><span class="string">)</span></span><br><span class="line"><span class="string">"</span><span class="string">""</span> % &#123;<span class="string">'ip'</span>: ip, <span class="string">'port'</span>: port, <span class="string">'username'</span>: username, <span class="string">'password'</span>: password&#125;</span><br><span class="line"></span><br><span class="line">plugin_file = <span class="string">'proxy_auth_plugin.zip'</span></span><br><span class="line">with zipfile.ZipFile(plugin_file, <span class="string">'w'</span>) as zp:</span><br><span class="line">    zp.writestr(<span class="string">"manifest.json"</span>, manifest_json)</span><br><span class="line">    zp.writestr(<span class="string">"background.js"</span>, background_js)</span><br><span class="line">chrome_options = Options()</span><br><span class="line">chrome_options.add_argument(<span 
class="string">"--start-maximized"</span>)</span><br><span class="line">chrome_options.add_extension(plugin_file)</span><br><span class="line">browser = webdriver.Chrome(<span class="attribute">chrome_options</span>=chrome_options)</span><br><span class="line">browser.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里需要在本地创建一个 manifest.json 配置文件和 background.js 脚本来设置认证代理，运行之后本地会生成一个 proxy_auth_plugin.zip 文件保存配置。 运行结果和上例一致，origin 同样为代理 IP。</p>
                  <h4 id="PhantomJS"><a href="#PhantomJS" class="headerlink" title="PhantomJS"></a>PhantomJS</h4>
                  <p>对于 PhantomJS，代理设置方法可以借助于 service_args 参数，也就是命令行参数，代理设置方法如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> selenium import webdriver</span><br><span class="line"></span><br><span class="line">service_args = [</span><br><span class="line">    <span class="string">'--proxy=127.0.0.1:9743'</span>,</span><br><span class="line">    <span class="string">'--proxy-type=http'</span></span><br><span class="line">]</span><br><span class="line">browser = webdriver.PhantomJS(<span class="attribute">service_args</span>=service_args)</span><br><span class="line">browser.<span class="builtin-name">get</span>(<span class="string">'http://httpbin.org/get'</span>)</span><br><span class="line"><span class="builtin-name">print</span>(browser.page_source)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们只需要使用 service_args 参数，将命令行的一些参数定义为列表，在初始化的时候传递即可。 运行结果：</p>
                  <figure class="highlight json">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;</span><br><span class="line">  <span class="attr">"args"</span>: &#123;&#125;, </span><br><span class="line">  <span class="attr">"headers"</span>: &#123;</span><br><span class="line">    <span class="attr">"Accept"</span>: <span class="string">"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"</span>, </span><br><span class="line">    <span class="attr">"Accept-Encoding"</span>: <span class="string">"gzip, deflate"</span>, </span><br><span class="line">    <span class="attr">"Accept-Language"</span>: <span class="string">"zh-CN,en,*"</span>, </span><br><span class="line">    <span class="attr">"Connection"</span>: <span class="string">"close"</span>, </span><br><span class="line">    <span class="attr">"Host"</span>: <span class="string">"httpbin.org"</span>, </span><br><span class="line">    <span class="attr">"User-Agent"</span>: <span class="string">"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.0 Safari/538.1"</span></span><br><span class="line">  &#125;, </span><br><span class="line">  <span class="attr">"origin"</span>: <span class="string">"106.185.45.153"</span>, </span><br><span class="line">  <span class="attr">"url"</span>: <span class="string">"http://httpbin.org/get"</span></span><br><span class="line">&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>运行结果的 origin 同样为代理的 IP，设置代理成功。 如果需要认证，那么只需要再加入 --proxy-auth 选项即可，这样参数就改为：</p>
                  <figure class="highlight ini">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="attr">service_args</span> = [</span><br><span class="line">    <span class="string">'--proxy=127.0.0.1:9743'</span>,</span><br><span class="line">    <span class="string">'--proxy-type=http'</span>,</span><br><span class="line">    <span class="string">'--proxy-auth=username:password'</span></span><br><span class="line">]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>将 username 和 password 替换为认证所需的用户名和密码即可。</p>
                  <h3 id="5-本节代码"><a href="#5-本节代码" class="headerlink" title="5. 本节代码"></a>5. 本节代码</h3>
                  <p>本节代码地址为：<a href="https://github.com/Python3WebSpider/ProxySettings" target="_blank" rel="noopener">https://github.com/Python3WebSpider/ProxySettings</a>。</p>
                  <h3 id="6-结语"><a href="#6-结语" class="headerlink" title="6. 结语"></a>6. 结语</h3>
                  <p>本节介绍了前文所介绍的请求库的代理设置方法，稍作了解即可，后面我们会使用这些方法来搭建代理池和爬取网站，进一步加深印象。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 14:03:32" itemprop="dateCreated datePublished" datetime="2019-08-02T14:03:32+08:00">2019-08-02</time>
                </span>
                <span id="/7045.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 9.1-代理的设置" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>8.4k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>8 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7043.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7043.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 9-代理的使用</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>我们在做爬虫的过程中经常会遇到这样的情况，最初爬虫正常运行，正常抓取数据，一切看起来都是那么的美好，然而一杯茶的功夫可能就会出现错误，比如 403 Forbidden，这时候打开网页一看，可能会看到“您的 IP 访问频率太高”这样的提示，或者跳出一个验证码让我们输入，输入之后才可能解封，但是输入之后过一会儿就又这样了。 出现这样的现象的原因是网站采取了一些反爬虫的措施，比如服务器会检测某个 IP 在单位时间内的请求次数，如果超过了这个阈值，那么会直接拒绝服务，返回一些错误信息，这种情况可以称之为封 IP，于是乎就成功把我们的爬虫禁掉了。 既然服务器检测的是某个 IP 单位时间的请求次数，那么我们借助某种方式来伪装我们的 IP，让服务器识别不出是由我们本机发起的请求，不就可以成功防止封 IP 了吗？ 所以这时候代理就派上用场了，本章我们会详细介绍一下代理的基本知识及各种代理的使用方式，帮助爬虫脱离封 IP 的苦海。 本章接下来会介绍代理的设置、代理池的维护、付费代理的使用、ADSL拨号代理的搭建方法。</p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 14:02:17" itemprop="dateCreated datePublished" datetime="2019-08-02T14:02:17+08:00">2019-08-02</time>
                </span>
                <span id="/7043.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 9-代理的使用" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>438</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>1 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7041.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7041.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 8.4-微博宫格验证码的识别</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>
                  <p>本节我们来介绍一下新浪微博宫格验证码的识别，此验证码是一种新型交互式验证码，每个宫格之间会有一条指示连线，指示了我们应该滑动的轨迹，我们需要按照滑动轨迹依次从起始宫格一直滑动到终止宫格才可以完成验证，如图 8-24 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055351.png" alt=""> 图 8-24 验证码示例 鼠标滑动后的轨迹会以黄色的连线来标识，如图 8-25 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055407.jpg" alt=""> 图 8-25 滑动过程 我们访问新浪微博移动版登录页面就可以看到如上验证码，链接为：<a href="https://passport.weibo.cn/signin/login" target="_blank" rel="noopener">https://passport.weibo.cn/signin/login</a>，当然也不是每次都会出现验证码，一般当频繁登录或者账号存在安全风险的时候会出现。 接下来我们就来试着识别一下此类验证码。</p>
                  <h3 id="1-本节目标"><a href="#1-本节目标" class="headerlink" title="1. 本节目标"></a>1. 本节目标</h3>
                  <p>本节我们的目标是用程序来识别并通过微博宫格验证码的验证。</p>
                  <h3 id="2-准备工作"><a href="#2-准备工作" class="headerlink" title="2. 准备工作"></a>2. 准备工作</h3>
                  <p>本次我们使用的 Python 库是 Selenium，使用的浏览器为 Chrome，在此之前请确保已经正确安装好了 Selenium 库、Chrome浏览器并配置好了 ChromeDriver，相关流程可以参考第一章的说明。</p>
                  <h3 id="3-识别思路"><a href="#3-识别思路" class="headerlink" title="3. 识别思路"></a>3. 识别思路</h3>
                  <p>要识别首先要从探寻规律入手，那么首先我们找到的规律就是此验证码的四个宫格一定是有连线经过的，而且每一条连线上都会有相应的指示箭头，连线的形状多样，如C型、Z型、X型等等，如图 8-26、8-27、8-28 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055423.png" alt=""> 图 8-26 C 型 <img src="https://qiniu.cuiqingcai.com/2019-08-02-055430.png" alt=""> 图 8-27 Z 型 <img src="https://qiniu.cuiqingcai.com/2019-08-02-055434.png" alt=""> 图 8-28 X 型 而同时我们发现同一种类型它的连线轨迹是相同的，唯一不同的就是连线的方向，如图 8-29、8-30 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055441.png" alt=""> 图 8-29 反向连线 <img src="https://qiniu.cuiqingcai.com/2019-08-02-055457.png" alt=""> 图 8-30 正向连线 这两种验证码的连线轨迹是相同的，但是由于连线上面的指示箭头不同导致滑动的宫格顺序就有所不同。 所以要完全识别滑动宫格顺序的话就需要具体识别出箭头的朝向，而观察一下整个验证码箭头朝向一共可能有 8 种，而且会出现在不同的位置，如果要写一个箭头方向识别算法的话需要都考虑到不同箭头所在的位置，我们需要找出各个位置的箭头的像素点坐标，同时识别算法还需要计算其像素点变化规律，这个工作量就变得比较大。 这时我们可以考虑用模板匹配的方法，模板匹配的意思就是将一些识别目标提前保存下来并做好标记，称作模板，在这里我们就可以获取验证码图片并做好拖动顺序的标记当做模板。在匹配的时候来对比要新识别的目标和每一个模板哪个是匹配的，如果找到匹配的模板，则被匹配到的模板就和新识别的目标是相同的，这样就成功识别出了要新识别的目标了。模板匹配在图像识别中也是非常常用的一种方法，实现简单而且易用性好。 模板匹配方法如果要效果好的话，我们必须要收集到足够多的模板才可以，而对于微博宫格验证码来说，宫格就 4 个，验证码的样式最多就是 4 * 3 * 2 * 1 = 24种，所以我们可以直接将所有模板都收集下来。 所以接下来我们需要考虑的就是用何种模板来进行匹配，是只匹配箭头还是匹配整个验证码全图呢？我们来权衡一下这两种方式的匹配精度和工作量：</p>
                  <ul>
                    <li>首先是精度问题。如果要匹配箭头的话，我们比对的目标只有几个像素点范围的箭头，而且我们需要精确知道各个箭头所在的像素点，一旦像素点有所偏差，那么匹配模板的时候会直接错位，导致匹配结果大打折扣。如果匹配全图，我们无需关心箭头所在位置，同时还有连线帮助辅助匹配，所以匹配精度上显然是全图匹配精度更高。</li>
                    <li>其次是工作量的问题。如果要匹配箭头的话，我们需要将所有不同朝向的箭头模板都保存下来，而相同位置箭头的朝向可能不一，相同朝向的箭头位置可能不一，这时候我们需要都算出各个箭头的位置并将其逐个截出来保存成模板，同时在匹配的时候也需要依次去探寻验证码对应位置是否有匹配模板。如果匹配全图的话，我们不需要关心每个箭头的位置和朝向，只需要将验证码全图保存下来即可，在匹配的时候也不需要再去计算箭头的位置，所以工作量上明显是匹配全图更小。</li>
                  </ul>
                  <p>所以综上考虑，我们选用全图匹配的方式来进行识别。 所以到此为止，我们就可以使用全图模板匹配的方法来识别这个宫格验证码了，找到匹配的模板之后，我们就可以得到事先为模板定义的拖动顺序，然后模拟拖动即可。</p>
                  <h3 id="4-获取模板"><a href="#4-获取模板" class="headerlink" title="4. 获取模板"></a>4. 获取模板</h3>
                  <p>在开始之前，我们需要做一下准备工作，先将 24 张验证码全图保存下来，保存工作难道需要手工来做吗？当然不是的，因为验证码是随机的，一共有 24 种，所以我们可以写一段程序来批量保存一些验证码图片，然后从中筛选出需要的图片就好了，代码如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span class="line">61</span><br><span 
class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br><span class="line">80</span><br><span class="line">81</span><br><span class="line">82</span><br><span class="line">83</span><br><span class="line">84</span><br><span class="line">85</span><br><span class="line">86</span><br><span class="line">87</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">import</span> time</span><br><span class="line"><span class="keyword">from</span> io <span class="keyword">import</span> BytesIO</span><br><span class="line"><span class="keyword">from</span> PIL <span class="keyword">import</span> Image</span><br><span class="line"><span class="keyword">from</span> selenium <span class="keyword">import</span> webdriver</span><br><span class="line"><span class="keyword">from</span> selenium.common.exceptions <span class="keyword">import</span> TimeoutException</span><br><span class="line"><span class="keyword">from</span> selenium.webdriver.common.by <span class="keyword">import</span> By</span><br><span class="line"><span class="keyword">from</span> selenium.webdriver.support.ui <span class="keyword">import</span> WebDriverWait</span><br><span class="line"><span class="keyword">from</span> selenium.webdriver.support <span class="keyword">import</span> expected_conditions <span class="keyword">as</span> EC</span><br><span class="line"></span><br><span class="line">USERNAME = <span class="string">''</span></span><br><span class="line">PASSWORD = <span class="string">''</span></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">CrackWeiboSlide</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self)</span>:</span></span><br><span class="line">        self.url = <span class="string">'https://passport.weibo.cn/signin/login'</span></span><br><span class="line">        self.browser = webdriver.Chrome()</span><br><span class="line">        self.wait = WebDriverWait(self.browser, <span class="number">20</span>)</span><br><span class="line">        self.username = USERNAME</span><br><span class="line">        self.password = PASSWORD</span><br><span 
class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__del__</span><span class="params">(self)</span>:</span></span><br><span class="line">        self.browser.close()</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">open</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        打开网页输入用户名密码并点击</span></span><br><span class="line"><span class="string">        :return: None</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        self.browser.get(self.url)</span><br><span class="line">        username = self.wait.until(EC.presence_of_element_located((By.ID, <span class="string">'loginName'</span>)))</span><br><span class="line">        password = self.wait.until(EC.presence_of_element_located((By.ID, <span class="string">'loginPassword'</span>)))</span><br><span class="line">        submit = self.wait.until(EC.element_to_be_clickable((By.ID, <span class="string">'loginAction'</span>)))</span><br><span class="line">        username.send_keys(self.username)</span><br><span class="line">        password.send_keys(self.password)</span><br><span class="line">        submit.click()</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">get_position</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        获取验证码位置</span></span><br><span class="line"><span class="string">        :return: 验证码位置元组</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        <span 
class="keyword">try</span>:</span><br><span class="line">            img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, <span class="string">'patt-shadow'</span>)))</span><br><span class="line">        <span class="keyword">except</span> TimeoutException:</span><br><span class="line">            print(<span class="string">'未出现验证码'</span>)</span><br><span class="line">            self.open()</span><br><span class="line">        time.sleep(<span class="number">2</span>)</span><br><span class="line">        location = img.location</span><br><span class="line">        size = img.size</span><br><span class="line">        top, bottom, left, right = location[<span class="string">'y'</span>], location[<span class="string">'y'</span>] + size[<span class="string">'height'</span>], location[<span class="string">'x'</span>], location[<span class="string">'x'</span>] + size[<span class="string">'width'</span>]</span><br><span class="line">        <span class="keyword">return</span> (top, bottom, left, right)</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">get_screenshot</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        获取网页截图</span></span><br><span class="line"><span class="string">        :return: 截图对象</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        screenshot = self.browser.get_screenshot_as_png()</span><br><span class="line">        screenshot = Image.open(BytesIO(screenshot))</span><br><span class="line">        <span class="keyword">return</span> screenshot</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">get_image</span><span class="params">(self, name=<span 
class="string">'captcha.png'</span>)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        获取验证码图片</span></span><br><span class="line"><span class="string">        :return: 图片对象</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        top, bottom, left, right = self.get_position()</span><br><span class="line">        print(<span class="string">'验证码位置'</span>, top, bottom, left, right)</span><br><span class="line">        screenshot = self.get_screenshot()</span><br><span class="line">        captcha = screenshot.crop((left, top, right, bottom))</span><br><span class="line">        captcha.save(name)</span><br><span class="line">        <span class="keyword">return</span> captcha</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">main</span><span class="params">(self)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        批量获取验证码</span></span><br><span class="line"><span class="string">        :return: 图片对象</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        count = <span class="number">0</span></span><br><span class="line">        <span class="keyword">while</span> <span class="literal">True</span>:</span><br><span class="line">            self.open()</span><br><span class="line">            self.get_image(str(count) + <span class="string">'.png'</span>)</span><br><span class="line">            count += <span class="number">1</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">'__main__'</span>:</span><br><span class="line">    crack = CrackWeiboSlide()</span><br><span class="line">    crack.main()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里需要将 USERNAME 和 PASSWORD 修改为自己微博的用户名和密码，运行一段时间后便可以发现在本地多了很多以数字命名的验证码，如图 8-31 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055514.jpg" alt=""> 图 8-31 获取结果 在这里我们只需要挑选出不同的 24 张验证码图片并命名保存就好了，名称可以直接取作宫格的滑动顺序，例如某张验证码图片如图 8-32 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055525.png" alt=""> 图 8-32 验证码示例 我们将其命名为 4132.png 即可，也就是代表滑动顺序为 4-1-3-2，按照这样的规则，我们将验证码整理为如下 24 张图，如图 8-33 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055531.jpg" alt=""> 图 8-33 整理结果 如上的 24 张图就是我们的模板，接下来我们在识别的时候只需要遍历模板进行匹配即可。</p>
                  <h3 id="5-模板匹配"><a href="#5-模板匹配" class="headerlink" title="5. 模板匹配"></a>5. 模板匹配</h3>
                  <p>上面的代码已经实现了将验证码保存下来的功能，通过调用 get_image() 方法我们便可以得到验证码图片对象，得到验证码对象之后我们就需要对其进行模板匹配了，定义如下的方法进行匹配：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">from</span> os <span class="keyword">import</span> listdir</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">detect_image</span><span class="params">(self, image)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    匹配图片</span></span><br><span class="line"><span class="string">    :param image: 图片</span></span><br><span class="line"><span class="string">    :return: 拖动顺序</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="keyword">for</span> template_name <span class="keyword">in</span> listdir(TEMPLATES_FOLDER):</span><br><span class="line">        print(<span class="string">'正在匹配'</span>, template_name)</span><br><span class="line">        template = Image.open(TEMPLATES_FOLDER + template_name)</span><br><span class="line">        <span class="keyword">if</span> self.same_image(image, template):</span><br><span class="line">            <span class="comment"># 返回顺序</span></span><br><span class="line">            numbers = [int(number) <span class="keyword">for</span> number <span class="keyword">in</span> list(template_name.split(<span class="string">'.'</span>)[<span class="number">0</span>])]</span><br><span class="line">            print(<span class="string">'拖动顺序'</span>, numbers)</span><br><span class="line">            <span class="keyword">return</span> numbers</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里 TEMPLATES_FOLDER 就是模板所在的文件夹，在这里我们用 listdir() 方法将所有模板的文件名称获取出来，然后对其进行遍历，通过 same_image() 方法对验证码和模板进行比对，如果成功匹配，那么就将匹配到的模板文件名转为列表，如匹配到了 3124.png，则返回结果 [3, 1, 2, 4]。 比对的方法实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">is_pixel_equal</span><span class="params">(self, image1, image2, x, y)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    判断两个像素是否相同</span></span><br><span class="line"><span class="string">    :param image1: 图片1</span></span><br><span class="line"><span class="string">    :param image2: 图片2</span></span><br><span class="line"><span class="string">    :param x: 位置x</span></span><br><span class="line"><span class="string">    :param y: 位置y</span></span><br><span class="line"><span class="string">    :return: 像素是否相同</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="comment"># 取两个图片的像素点</span></span><br><span class="line">    pixel1 = image1.load()[x, y]</span><br><span class="line">    pixel2 = image2.load()[x, y]</span><br><span class="line">    threshold = <span class="number">20</span></span><br><span class="line">    <span class="keyword">if</span> abs(pixel1[<span class="number">0</span>] - pixel2[<span class="number">0</span>]) &lt; threshold <span class="keyword">and</span> abs(pixel1[<span class="number">1</span>] - pixel2[<span class="number">1</span>]) &lt; threshold <span class="keyword">and</span> abs(</span><br><span class="line">            pixel1[<span class="number">2</span>] - pixel2[<span class="number">2</span>]) &lt; threshold:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">True</span></span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">False</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">same_image</span><span 
class="params">(self, image, template)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    识别相似验证码</span></span><br><span class="line"><span class="string">    :param image: 待识别验证码</span></span><br><span class="line"><span class="string">    :param template: 模板</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="comment"># 相似度阈值</span></span><br><span class="line">    threshold = <span class="number">0.99</span></span><br><span class="line">    count = <span class="number">0</span></span><br><span class="line">    <span class="keyword">for</span> x <span class="keyword">in</span> range(image.width):</span><br><span class="line">        <span class="keyword">for</span> y <span class="keyword">in</span> range(image.height):</span><br><span class="line">            <span class="comment"># 判断像素是否相同</span></span><br><span class="line">            <span class="keyword">if</span> self.is_pixel_equal(image, template, x, y):</span><br><span class="line">                count += <span class="number">1</span></span><br><span class="line">    result = float(count) / (image.width * image.height)</span><br><span class="line">    <span class="keyword">if</span> result &gt; threshold:</span><br><span class="line">        print(<span class="string">'成功匹配'</span>)</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">True</span></span><br><span class="line">    <span class="keyword">return</span> <span class="literal">False</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里比对图片也是利用了遍历像素的方法，same_image() 方法接收两个参数，image 为待检测的验证码图片对象，template 是模板对象，由于二者大小是完全一致的，所以在这里我们遍历了图片的所有像素点，比对二者同一位置的像素点是否相同，如果相同就计数加 1，最后计算一下相同的像素点占总像素的比例，如果该比例超过一定阈值那就判定为图片完全相同，匹配成功。在这里设定阈值为 0.99，即如果二者有 0.99 以上的相似比则代表匹配成功。 这样通过上面的方法，依次匹配 24 个模板，如果验证码图片正常，总能找到一个匹配的模板，这样最后就可以得到宫格的滑动顺序了。</p>
                  <h3 id="6-模拟拖动"><a href="#6-模拟拖动" class="headerlink" title="6. 模拟拖动"></a>6. 模拟拖动</h3>
                  <p>得到了滑动顺序之后，我们接下来就是根据滑动顺序来拖动鼠标连接各个宫格了，方法实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">def move(self, numbers):</span><br><span class="line">    <span class="string">""</span><span class="string">"</span></span><br><span class="line"><span class="string">    根据顺序拖动</span></span><br><span class="line"><span class="string">    :param numbers:</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    "</span><span class="string">""</span></span><br><span class="line">    <span class="comment"># 获得四个按点</span></span><br><span class="line">    circles = self.browser.find_elements_by_css_selector(<span class="string">'.patt-wrap .patt-circ'</span>)</span><br><span class="line">    dx = dy = <span class="number">0</span></span><br><span class="line">    <span class="keyword">for</span> <span class="keyword">index</span> in range(<span class="number">4</span>):</span><br><span class="line">        circle = circles[numbers[<span class="keyword">index</span>] - <span class="number">1</span>]</span><br><span class="line">        <span class="comment"># 如果是第一次循环</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">index</span> == <span class="number">0</span>:</span><br><span class="line">            <span class="comment"># 点击第一个按点</span></span><br><span class="line">            ActionChains(self.browser) \</span><br><span class="line">                .move_to_element_with_offset(circle, circle.size[<span class="string">'width'</span>] / <span class="number">2</span>, circle.size[<span class="string">'height'</span>] / <span class="number">2</span>) \</span><br><span class="line">                .click_and_hold().perform()</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="comment"># 小幅移动次数</span></span><br><span class="line">            <span class="keyword">times</span> = <span class="number">30</span></span><br><span 
class="line">            <span class="comment"># 拖动</span></span><br><span class="line">            <span class="keyword">for</span> i in range(<span class="keyword">times</span>):</span><br><span class="line">                ActionChains(self.browser).move_by_offset(dx / <span class="keyword">times</span>, dy / <span class="keyword">times</span>).perform()</span><br><span class="line">                time.sleep(<span class="number">1</span> / <span class="keyword">times</span>)</span><br><span class="line">        <span class="comment"># 如果是最后一次循环</span></span><br><span class="line">        <span class="keyword">if</span> <span class="keyword">index</span> == <span class="number">3</span>:</span><br><span class="line">            <span class="comment"># 松开鼠标</span></span><br><span class="line">            ActionChains(self.browser).release().perform()</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="comment"># 计算下一次偏移</span></span><br><span class="line">            dx = circles[numbers[<span class="keyword">index</span> + <span class="number">1</span>] - <span class="number">1</span>].location[<span class="string">'x'</span>] - circle.location[<span class="string">'x'</span>]</span><br><span class="line">            dy = circles[numbers[<span class="keyword">index</span> + <span class="number">1</span>] - <span class="number">1</span>].location[<span class="string">'y'</span>] - circle.location[<span class="string">'y'</span>]</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里方法接收的参数就是宫格的点按顺序，如 [3, 1, 2, 4]。首先我们利用 find_elements_by_css_selector() 方法获取到四个宫格元素，是一个列表形式，每个元素代表一个宫格，接下来我们遍历了宫格的点按顺序，再做一系列对应操作。 其中如果是第一个宫格，那就直接鼠标点击并保持动作，否则移动到下一个宫格。如果是最后一个宫格，那就松开鼠标，否则计算移动到下一个宫格的偏移量。 通过四次循环，我们便可以成功操作浏览器完成宫格验证码的拖拽填充，松开鼠标之后即可识别成功。 运行效果如图 8-34 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055449.png" alt=""> 图 8-34 运行效果 鼠标会慢慢地从起始位置移动到终止位置，最后一个宫格松开之后便完成了验证码的识别。 至此，微博宫格验证码的识别就全部完成了。 识别完成之后验证码窗口会自动关闭，接下来直接点击登录按钮即可完成微博登录。</p>
                  <h3 id="7-本节代码"><a href="#7-本节代码" class="headerlink" title="7. 本节代码"></a>7. 本节代码</h3>
                  <p>本节代码地址为：<a href="https://github.com/Python3WebSpider/CrackWeiboSlide" target="_blank" rel="noopener">https://github.com/Python3WebSpider/CrackWeiboSlide</a>。</p>
                  <h3 id="8-结语"><a href="#8-结语" class="headerlink" title="8. 结语"></a>8. 结语</h3>
                  <p>本节我们介绍了一种常用的模板匹配识别图片的方式来识别验证码，并模拟了鼠标拖拽动作来实现验证码的识别。如果遇到类似的验证码，可以采用同样的思路进行识别。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 13:56:40" itemprop="dateCreated datePublished" datetime="2019-08-02T13:56:40+08:00">2019-08-02</time>
                </span>
                <span id="/7041.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 8.4-微博宫格验证码的识别" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>7k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>6 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7039.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7039.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 8.3-点触点选验证码的识别</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random" alt="">
                </div>
                <div class="excerpt">
                  <p>
                  <p>上一节我们实现了极验验证码的识别，但是除了极验其实还有另一种常见的且应用广泛的验证码，比较有代表性的就是点触验证码。 可能你对这个名字比较陌生，但是肯定见过类似的验证码，比如 12306，这就是一种典型的点触验证码，如图 8-18 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055135.jpg" alt=""> 图 8-18 12306 验证码 我们需要直接点击图中符合要求的图，如果所有答案均正确才会验证成功，如果有一个答案错误，验证就会失败，这种验证码就可以称之为点触验证码。 另外还有一个专门提供点触验证码服务的站点，叫做 TouClick，其官方网站为：<a href="https://www.touclick.com/" target="_blank" rel="noopener">https://www.touclick.com/</a>，本节就以它为例讲解一下此类验证码的识别过程。</p>
                  <h3 id="1-本节目标"><a href="#1-本节目标" class="headerlink" title="1. 本节目标"></a>1. 本节目标</h3>
                  <p>本节我们的目标是用程序来识别并通过点触验证码的验证。</p>
                  <h3 id="2-准备工作"><a href="#2-准备工作" class="headerlink" title="2. 准备工作"></a>2. 准备工作</h3>
                  <p>本次我们使用的 Python 库是 Selenium，使用的浏览器为 Chrome，在此之前请确保已经正确安装好了 Selenium 库、Chrome浏览器并配置好了 ChromeDriver，相关流程可以参考第一章的说明。</p>
                  <h3 id="3-了解点触验证码"><a href="#3-了解点触验证码" class="headerlink" title="3. 了解点触验证码"></a>3. 了解点触验证码</h3>
                  <p>TouClick 官方网站的验证码样式如图 8-19 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055138.jpg" alt=""> 图 8-19 验证码样式 和 12306 站点有相似之处，不过这次是点击图片中的文字，不是图片了，另外还有各种形形色色的点触验证码，其交互形式可能略有不同，但基本原理都是类似的。 接下来我们就来统一实现一下此类点触验证码的识别过程。</p>
                  <h3 id="4-识别思路"><a href="#4-识别思路" class="headerlink" title="4. 识别思路"></a>4. 识别思路</h3>
                  <p>此种验证码如果依靠图像识别的话识别难度非常之大。 例如就 12306 来说，其识别难点有两点，第一点是文字识别，如图 8-20 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055142.jpg" alt=""> 图 8-20 12306 验证码 如点击图中所有的漏斗，“漏斗”二字其实都经过变形、放缩、模糊处理了，如果要借助于前面我们讲的 OCR 技术来识别，识别的精准度会大打折扣，甚至得不到任何结果。第二点是图像的识别，我们需要将图像重新转化为文字，可以借助于各种识图接口，可经我测试识别正确结果的准确率非常低，经常会出现匹配不正确或匹配不出结果的情况，而且图片本身的清晰度也不够，所以识别难度会更大，更何况需要同时识别出八张图片的结果，且其中几个答案需要完全匹配正确才能验证通过，综合来看，此种方法基本是不可行的。 再拿 TouClick 来说，如图 8-21 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055147.jpg" alt=""> 图 8-21 验证码示例 我们需要从这幅图片中识别出植株二字，但是图片的背景或多或少会有干扰，导致 OCR 几乎不会识别出结果，有人会说，直接识别白色的文字不就好了吗？但是如果换一张验证码呢？如图 8-22 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055153.jpg" alt=""> 图 8-22 验证码示例 这张验证码图片的文字又变成了蓝色，而且还有白色阴影，识别的难度又会大大增加。 那么此类验证码就没法解了吗？答案当然是有，靠什么？靠人。 靠人解决？那还要程序做什么？不要急，这里说的人并不是我们自己去解，在互联网上存在非常多的验证码服务平台，平台 7x24 小时提供验证码识别服务，一张图片几秒就会获得识别结果，准确率可达 90% 以上，但是就需要花点钱来购买服务了，毕竟平台都是需要盈利的，不过不用担心，识别一个验证码只需要几分钱。 在这里我个人比较推荐的一个平台是超级鹰，其官网为：<a href="https://www.chaojiying.com" target="_blank" rel="noopener">https://www.chaojiying.com</a>，非广告。 其提供的服务种类非常广泛，可识别的验证码类型非常多，其中就包括此类点触验证码。 另外超级鹰平台同样支持简单的图形验证码识别，如果 OCR 识别有难度，同样可以用与本节相同的方法借助此平台来识别，下面是此平台提供的一些服务：</p>
                  <ul>
                    <li>英文数字，提供最多20位英文数字的混合识别</li>
                    <li>中文汉字，提供最多7个汉字的识别</li>
                    <li>纯英文，提供最多12位的英文的识别</li>
                    <li>纯数字，提供最多11位的数字的识别</li>
                    <li>任意特殊字符，提供不定长汉字英文数字、拼音首字母、计算题、成语混合、 集装箱号等字符的识别</li>
                    <li>坐标选择识别，如复杂计算题、选择题四选一、问答题、点击相同的字、物品、动物等返回多个坐标的识别</li>
                  </ul>
                  <p>具体如有变动以官网为准：<a href="https://www.chaojiying.com/price.html" target="_blank" rel="noopener">https://www.chaojiying.com/price.html</a>。 而本节我们需要解决的就是属于最后一类，坐标多选识别的情况，我们需要做的就是将验证码图片提交给平台，然后平台会返回识别结果在图片中的坐标位置，接下来我们再解析坐标模拟点击就好了。 原理非常简单，下面我们就来实际用程序来实验一下。</p>
                  <h3 id="5-注册账号"><a href="#5-注册账号" class="headerlink" title="5. 注册账号"></a>5. 注册账号</h3>
                  <p>在开始之前，我们需要先注册一个超级鹰账号并申请一个软件ID，注册页面链接为：<a href="https://www.chaojiying.com/user/reg/" target="_blank" rel="noopener">https://www.chaojiying.com/user/reg/</a>，注册完成之后还需要在后台开发商中心添加一个软件ID，最后一件事就是充值一些题分，充值多少可以根据价格和识别量自行决定。</p>
                  <h3 id="6-获取API"><a href="#6-获取API" class="headerlink" title="6. 获取API"></a>6. 获取API</h3>
                  <p>做好上面的准备工作之后我们就可以开始用程序来对接验证码的识别了。 首先我们可以到官方网站下载对应的 Python API，链接为：<a href="https://www.chaojiying.com/api-14.html" target="_blank" rel="noopener">https://www.chaojiying.com/api-14.html</a>，但是此 API 是Python2 版本的，是用 Requests 库来实现的，我们可以简单更改几个地方即可将其修改为 Python3 版本。 修改之后的API如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">from</span> hashlib <span class="keyword">import</span> md5</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">Chaojiying</span><span class="params">(object)</span>:</span></span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, username, password, soft_id)</span>:</span></span><br><span class="line">        self.username = username</span><br><span class="line">        self.password = md5(password.encode(<span class="string">'utf-8'</span>)).hexdigest()</span><br><span class="line">        self.soft_id = soft_id</span><br><span class="line">        self.base_params = &#123;</span><br><span class="line">            <span class="string">'user'</span>: self.username,</span><br><span class="line">            <span class="string">'pass2'</span>: self.password,</span><br><span class="line">            <span class="string">'softid'</span>: self.soft_id,</span><br><span class="line">        &#125;</span><br><span class="line">        self.headers = &#123;</span><br><span class="line">            <span class="string">'Connection'</span>: <span class="string">'Keep-Alive'</span>,</span><br><span class="line">            <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)'</span>,</span><br><span class="line">        &#125;</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">post_pic</span><span class="params">(self, im, codetype)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">      
  im: 图片字节</span></span><br><span class="line"><span class="string">        codetype: 题目类型 参考 http://www.chaojiying.com/price.html</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        params = &#123;</span><br><span class="line">            <span class="string">'codetype'</span>: codetype,</span><br><span class="line">        &#125;</span><br><span class="line">        params.update(self.base_params)</span><br><span class="line">        files = &#123;<span class="string">'userfile'</span>: (<span class="string">'ccc.jpg'</span>, im)&#125;</span><br><span class="line">        r = requests.post(<span class="string">'http://upload.chaojiying.net/Upload/Processing.php'</span>, data=params, files=files, headers=self.headers)</span><br><span class="line">        <span class="keyword">return</span> r.json()</span><br><span class="line"></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">report_error</span><span class="params">(self, im_id)</span>:</span></span><br><span class="line">        <span class="string">"""</span></span><br><span class="line"><span class="string">        im_id:报错题目的图片ID</span></span><br><span class="line"><span class="string">        """</span></span><br><span class="line">        params = &#123;</span><br><span class="line">            <span class="string">'id'</span>: im_id,</span><br><span class="line">        &#125;</span><br><span class="line">        params.update(self.base_params)</span><br><span class="line">        r = requests.post(<span class="string">'http://upload.chaojiying.net/Upload/ReportError.php'</span>, data=params, headers=self.headers)</span><br><span class="line">        <span class="keyword">return</span> r.json()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里定义了一个 Chaojiying 类，其构造函数接收三个参数，分别是超级鹰的用户名、密码以及软件ID，保存好以备使用。 接下来是最重要的一个方法叫做 post_pic()，这里需要传入图片对象和验证码的代号，该方法会将图片对象和相关信息发给超级鹰的后台进行识别，然后将识别成功的 JSON 返回。 另一个方法叫做 report_error()，这个是发生错误的时候的回调，如果验证码识别错误，调用此方法会返还相应的题分。 接下来我们以 TouClick 的官网为例来演示点触验证码的识别过程，链接为：<a href="http://admin.touclick.com/" target="_blank" rel="noopener">http://admin.touclick.com/</a>，如果没有注册账号可以先注册一个。</p>
                  <h3 id="7-初始化"><a href="#7-初始化" class="headerlink" title="7. 初始化"></a>7. 初始化</h3>
                  <p>首先我们需要初始化一些变量，如 WebDriver、Chaojiying对象等等，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">EMAIL = <span class="string">'cqc@cuiqingcai.com'</span></span><br><span class="line">PASSWORD = <span class="string">''</span></span><br><span class="line"><span class="comment"># 超级鹰用户名、密码、软件ID、验证码类型</span></span><br><span class="line">CHAOJIYING_USERNAME = <span class="string">'Germey'</span></span><br><span class="line">CHAOJIYING_PASSWORD = <span class="string">''</span></span><br><span class="line">CHAOJIYING_SOFT_ID = <span class="number">893590</span></span><br><span class="line">CHAOJIYING_KIND = <span class="number">9102</span></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">CrackTouClick</span>():</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(<span class="keyword">self</span>)</span></span><span class="symbol">:</span></span><br><span class="line">        <span class="keyword">self</span>.url = <span class="string">'http://admin.touclick.com/login.html'</span></span><br><span class="line">        <span class="keyword">self</span>.browser = webdriver.Chrome()</span><br><span class="line">        <span class="keyword">self</span>.wait = WebDriverWait(<span class="keyword">self</span>.browser, <span class="number">20</span>)</span><br><span class="line">        <span class="keyword">self</span>.email = EMAIL</span><br><span class="line">        <span class="keyword">self</span>.password = PASSWORD</span><br><span class="line">        <span class="keyword">self</span>.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这里的账号和密码请自行修改。</p>
                  <h3 id="8-获取验证码"><a href="#8-获取验证码" class="headerlink" title="8. 获取验证码"></a>8. 获取验证码</h3>
                  <p>接下来的第一步就是完善相关表单，然后模拟点击呼出验证码，此步非常简单，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">open</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    打开网页输入用户名密码</span></span><br><span class="line"><span class="string">    :return: None</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    self.browser.get(self.url)</span><br><span class="line">    email = self.wait.until(EC.presence_of_element_located((By.ID, <span class="string">'email'</span>)))</span><br><span class="line">    password = self.wait.until(EC.presence_of_element_located((By.ID, <span class="string">'password'</span>)))</span><br><span class="line">    email.send_keys(self.email)</span><br><span class="line">    password.send_keys(self.password)</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_touclick_button</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取初始验证按钮</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, <span class="string">'touclick-hod-wrap'</span>)))</span><br><span class="line">    <span class="keyword">return</span> button</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里 open() 方法负责填写表单，get_touclick_button() 方法则是获取验证码按钮，随后触发点击即可。 接下来我们需要像上一节获取极验验证码图像那样，首先获取验证码图片的位置和大小，随后从网页截图里面截取相应的验证码图片。代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_touclick_element</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取验证图片对象</span></span><br><span class="line"><span class="string">    :return: 图片对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    element = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, <span class="string">'touclick-pub-content'</span>)))</span><br><span class="line">    <span class="keyword">return</span> element</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_position</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取验证码位置</span></span><br><span class="line"><span class="string">    :return: 验证码位置元组</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    element = self.get_touclick_element()</span><br><span class="line">    time.sleep(<span class="number">2</span>)</span><br><span class="line">    location = element.location</span><br><span class="line">    size = element.size</span><br><span class="line">    top, bottom, left, right = location[<span class="string">'y'</span>], location[<span class="string">'y'</span>] + size[<span class="string">'height'</span>], location[<span class="string">'x'</span>], location[<span class="string">'x'</span>] + size[</span><br><span class="line">        <span class="string">'width'</span>]</span><br><span class="line">    <span class="keyword">return</span> (top, bottom, left, right)</span><br><span class="line"></span><br><span class="line"><span class="function"><span 
class="keyword">def</span> <span class="title">get_screenshot</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取网页截图</span></span><br><span class="line"><span class="string">    :return: 截图对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    screenshot = self.browser.get_screenshot_as_png()</span><br><span class="line">    screenshot = Image.open(BytesIO(screenshot))</span><br><span class="line">    <span class="keyword">return</span> screenshot</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_touclick_image</span><span class="params">(self, name=<span class="string">'captcha.png'</span>)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取验证码图片</span></span><br><span class="line"><span class="string">    :return: 图片对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    top, bottom, left, right = self.get_position()</span><br><span class="line">    print(<span class="string">'验证码位置'</span>, top, bottom, left, right)</span><br><span class="line">    screenshot = self.get_screenshot()</span><br><span class="line">    captcha = screenshot.crop((left, top, right, bottom))</span><br><span class="line">    <span class="keyword">return</span> captcha</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里 get_touclick_image() 方法即为从网页截图中截取对应的验证码图片，其中验证码图片的相对位置坐标由 get_position() 方法返回得到，最后我们得到的是一个 Image 对象。</p>
                  <h3 id="9-识别验证码"><a href="#9-识别验证码" class="headerlink" title="9. 识别验证码"></a>9. 识别验证码</h3>
                  <p>随后我们调用 Chaojiying 对象的 post_pic() 方法即可把图片发送给超级鹰后台，在这里发送的图像是字节流格式，代码实现如下：</p>
                  <figure class="highlight routeros">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">image = self.get_touclick_image()</span><br><span class="line">bytes_array = BytesIO()</span><br><span class="line">image.save(bytes_array, <span class="attribute">format</span>=<span class="string">'PNG'</span>)</span><br><span class="line"><span class="comment"># 识别验证码</span></span><br><span class="line">result = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)</span><br><span class="line"><span class="builtin-name">print</span>(result)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>这样运行之后 result 变量就是超级鹰后台的识别结果，运行可能需要等待几秒，毕竟后台还有人工来完成识别。 返回的结果是一个 JSON，识别成功后，一个典型的返回结果类似如下：</p>
                  <figure class="highlight 1c">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">&#123;'err_no': <span class="number">0</span>, 'err_str': 'OK', 'pic_id': '<span class="number">60020013809492</span><span class="number">0000</span>1', 'pic_str': '132,127|56,77', 'md5': '1f8e1d4bef8b<span class="number">1148</span>4cb1f1f<span class="number">34299865</span>b'&#125;</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>其中 pic_str 就是识别的文字的坐标，是以字符串形式返回的，每个坐标都以 | 分隔，所以接下来我们只需要将其解析之后再模拟点击即可，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_points</span><span class="params">(self, captcha_result)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    解析识别结果</span></span><br><span class="line"><span class="string">    :param captcha_result: 识别结果</span></span><br><span class="line"><span class="string">    :return: 转化后的结果</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    groups = captcha_result.get(<span class="string">'pic_str'</span>).split(<span class="string">'|'</span>)</span><br><span class="line">    locations = [[int(number) <span class="keyword">for</span> number <span class="keyword">in</span> group.split(<span class="string">','</span>)] <span class="keyword">for</span> group <span class="keyword">in</span> groups]</span><br><span class="line">    <span class="keyword">return</span> locations</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">touch_click_words</span><span class="params">(self, locations)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    点击验证图片</span></span><br><span class="line"><span class="string">    :param locations: 点击位置</span></span><br><span class="line"><span class="string">    :return: None</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="keyword">for</span> location <span class="keyword">in</span> locations:</span><br><span class="line">        print(location)</span><br><span class="line">        ActionChains(self.browser).move_to_element_with_offset(self.get_touclick_element(), location[<span class="number">0</span>], location[<span 
class="number">1</span>]).click().perform()</span><br><span class="line">        time.sleep(<span class="number">1</span>)</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们用 get_points() 方法将识别结果变成了列表的形式，最后 touch_click_words() 方法则通过调用 move_to_element_with_offset() 方法依次传入解析后的坐标，然后点击即可。 这样我们就可以模拟完成坐标的点选了，运行效果如图 8-23 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055241.jpg" alt=""> 图 8-23 点选效果 最后我们需要做的就是点击提交验证的按钮等待验证通过，再点击登录按钮即可成功登录，后续实现在此不再赘述。 这样我们就借助于在线验证码平台完成了点触验证码的识别，此种方法也是一种通用方法，用此方法来识别 12306 等验证码也是完全相同的原理。</p>
                  <h3 id="10-本节代码"><a href="#10-本节代码" class="headerlink" title="10. 本节代码"></a>10. 本节代码</h3>
                  <p>本节代码地址为：<a href="https://github.com/Python3WebSpider/CrackTouClick" target="_blank" rel="noopener">https://github.com/Python3WebSpider/CrackTouClick</a>。</p>
                  <h3 id="11-结语"><a href="#11-结语" class="headerlink" title="11. 结语"></a>11. 结语</h3>
                  <p>本节我们通过在线打码平台辅助完成了验证码的识别，这种识别方法非常强大，几乎任意的验证码都可以识别，如果遇到难题，借助于打码平台无疑是一个极佳的选择。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 13:53:42" itemprop="dateCreated datePublished" datetime="2019-08-02T13:53:42+08:00">2019-08-02</time>
                </span>
                <span id="/7039.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 8.3-点触点选验证码的识别" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>7k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>6 分钟</span>
                </span>
              </div>
            </article>
            <article itemscope itemtype="http://schema.org/Article" class="post-block index" lang="zh-CN">
              <link itemprop="mainEntityOfPage" href="https://cuiqingcai.com/7037.html">
              <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
                <meta itemprop="image" content="/images/avatar.png">
                <meta itemprop="name" content="崔庆才">
                <meta itemprop="description" content="崔庆才的个人站点，记录生活的瞬间，分享学习的心得。">
              </span>
              <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
                <meta itemprop="name" content="静觅">
              </span>
              <header class="post-header">
                <h2 class="post-title" itemprop="name headline">
                  <a class="label"> Python <i class="label-arrow"></i>
                  </a>
                  <a href="/7037.html" class="post-title-link" itemprop="url">[Python3网络爬虫开发实战] 8.2-极验滑动验证码的识别</a>
                </h2>
              </header>
              <div class="post-body" itemprop="articleBody">
                <div class="thumb">
                  <img itemprop="contentUrl" class="random">
                </div>
                <div class="excerpt">
                  <p>
                  <p>上节我们了解了图形验证码的识别，简单的图形验证码我们可以直接利用 Tesserocr 来识别，但是近几年又出现了一些新型验证码，如滑动验证码，比较有代表性的就是极验验证码，它需要拖动拼合滑块才可以完成验证，相对图形验证码来说识别难度上升了几个等级，本节来讲解下极验验证码的识别过程。</p>
                  <h3 id="1-本节目标"><a href="#1-本节目标" class="headerlink" title="1. 本节目标"></a>1. 本节目标</h3>
                  <p>本节我们的目标是用程序来识别并通过极验验证码的验证，其步骤有分析识别思路、识别缺口位置、生成滑块拖动路径，最后模拟实现滑块拼合通过验证。</p>
                  <h3 id="2-准备工作"><a href="#2-准备工作" class="headerlink" title="2. 准备工作"></a>2. 准备工作</h3>
                  <p>本次我们使用的 Python 库是 Selenium，使用的浏览器为 Chrome，在此之前请确保已经正确安装好了 Selenium 库、Chrome 浏览器并配置好了 ChromeDriver，相关流程可以参考第一章的说明。</p>
                  <h3 id="3-了解极验验证码"><a href="#3-了解极验验证码" class="headerlink" title="3. 了解极验验证码"></a>3. 了解极验验证码</h3>
                  <p>极验验证码其官网为：<a href="http://www.geetest.com/" target="_blank" rel="noopener">http://www.geetest.com/</a>，它是一个专注于提供验证安全的系统，主要验证方式是拖动滑块拼合图像，若图像完全拼合，则验证成功，即可以成功提交表单，否则需要重新验证，样例如图8-5 和 8-6 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054905.jpg" alt=""> 图 8-5 验证码示例 <img src="https://qiniu.cuiqingcai.com/2019-08-02-054908.jpg" alt=""> 图 8-6 验证码示例 现在极验验证码已经更新到了 3.0 版本，截至 2017 年 7 月全球已有十六万家企业正在使用极验，每天服务响应超过四亿次，广泛应用于直播视频、金融服务、电子商务、游戏娱乐、政府企业等各大类型网站，下面是斗鱼、魅族的登录页面，可以看到其都对接了极验验证码，如图 8-7 和 8-8 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054908.jpg" alt=""> 图 8-7 斗鱼登录页面 <img src="https://qiniu.cuiqingcai.com/2019-08-02-054912.jpg" alt=""> 图 8-8 魅族登录页面</p>
                  <h3 id="4-极验验证码的特点"><a href="#4-极验验证码的特点" class="headerlink" title="4. 极验验证码的特点"></a>4. 极验验证码的特点</h3>
                  <p>这种验证码相较于图形验证码来说识别难度更大，极验验证码首先需要在前台验证通过，对于极验 3.0，我们首先需要点击按钮进行智能验证，如果验证不通过，则会弹出滑动验证的窗口，随后需要拖动滑块拼合图像进行验证，验证之后会生成三个加密参数，参数随后通过表单提交到后台，后台还会进行一次验证。 另外极验还增加了机器学习的方法来识别拖动轨迹，官方网站的安全防护说明如下：</p>
                  <ul>
                    <li>三角防护之防模拟</li>
                  </ul>
                  <p>恶意程序模仿人类行为轨迹对验证码进行识别。针对模拟，极验拥有超过 4000 万人机行为样本的海量数据。利用机器学习和神经网络构建线上线下的多重静态、动态防御模型。识别模拟轨迹，界定人机边界。</p>
                  <ul>
                    <li>三角防护之防伪造</li>
                  </ul>
                  <p>恶意程序通过伪造设备浏览器环境对验证码进行识别。针对伪造，极验利用设备基因技术。深度分析浏览器的实际性能来辨识伪造信息。同时根据伪造事件不断更新黑名单，大幅提高防伪造能力。</p>
                  <ul>
                    <li>三角防护之防暴力</li>
                  </ul>
                  <p>恶意程序短时间内进行密集的攻击，对验证码进行暴力识别。针对暴力，极验拥有多种验证形态，每一种验证形态都有利用神经网络生成的海量图库储备，每一张图片都是独一无二的，且图库不断更新，极大程度提高了暴力识别的成本。 另外极验的验证相对于普通验证方式更加方便，体验更加友好，其官方网站说明如下：</p>
                  <ul>
                    <li>点击一下，验证只需要 0.4 秒</li>
                  </ul>
                  <p>极验始终专注于去验证化实践，让验证环节不再打断产品本身的交互流程，最终达到优化用户体验和提高用户转化率的效果。</p>
                  <ul>
                    <li>全平台兼容，适用各种交互场景</li>
                  </ul>
                  <p>极验兼容所有主流浏览器甚至古老的 IE6，也可以轻松应用在 iOS 和 Android 移动端平台，满足各种业务需求，保护网站资源不被滥用和盗取。</p>
                  <ul>
                    <li>面向未来，懂科技，更懂人性</li>
                  </ul>
                  <p>极验在保障安全同时不断致力于提升用户体验，精雕细琢的验证面板，流畅顺滑的验证动画效果，让验证过程不再枯燥乏味。 因此，相较于一般验证码，极验的验证安全性和易用性有了非常大的提高。</p>
                  <h3 id="5-识别思路"><a href="#5-识别思路" class="headerlink" title="5. 识别思路"></a>5. 识别思路</h3>
                  <p>但是对于应用了极验验证码的网站，识别并不是没有办法的。如果我们直接模拟表单提交的话，加密参数的构造是个问题，参数构造有问题服务端就会校验失败，所以在这里我们采用直接模拟浏览器动作的方式来完成验证，在 Python 中我们就可以使用 Selenium 来通过完全模拟人的行为的方式来完成验证，此验证成本相对于直接去识别加密算法容易不少。 首先我们找到一个带有极验验证的网站，最合适的当然为极验官方后台了，链接为：<a href="https://account.geetest.com/login" target="_blank" rel="noopener">https://account.geetest.com/login</a>，首先可以看到在登录按钮上方有一个极验验证按钮，如图 8-9 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054921.jpg" alt=""> 图 8-9 验证按钮 此按钮为智能验证按钮，点击一下即可智能验证，一般来说如果是同一个 Session，一小段时间内第二次登录便会直接通过验证，如果智能识别不通过，则会弹出滑动验证窗口，我们便需要拖动滑块来拼合图像完成二步验证，如图 8-10 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054919.jpg" alt=""> 图 8-10 拖动示例 验证成功后验证按钮便会变成如下状态，如图 8-11 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-54922.jpg" alt=""> 图 8-11 验证成功结果 接下来我们便可以进行表单提交了。 所以在这里我们要识别验证需要做的有三步：</p>
                  <ul>
                    <li>模拟点击验证按钮</li>
                    <li>识别滑动缺口的位置</li>
                    <li>模拟拖动滑块</li>
                  </ul>
                  <p>第一步操作是最简单的，我们可以直接用 Selenium 模拟点击按钮即可。 第二步操作识别缺口的位置比较关键，需要用到图像的相关处理方法，那缺口怎么找呢？首先来观察一下缺口的样子，如图 8-12 和 8-13 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054942.jpg" alt=""> 图 8-12 缺口示例 <img src="https://qiniu.cuiqingcai.com/2019-08-02-054944.jpg" alt=""> 图 8-13 缺口示例 可以看到缺口的四周边缘有明显的断裂边缘，而且边缘和边缘周围有明显的区别，我们可以实现一个边缘检测算法来找出缺口的位置。对于极验来说，我们可以利用和原图对比检测的方式来识别缺口的位置，因为在没有滑动滑块之前，缺口其实是没有呈现的，如图 8-14 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054940.jpg" alt=""> 图 8-14 初始状态 所以我们可以同时获取两张图片，设定一个对比阈值，然后遍历两张图片找出相同位置像素 RGB 差距超过此阈值的像素点位置，那么此位置就是缺口的位置。 第三步操作看似简单，但是其中的坑比较多，极验验证码增加了机器轨迹识别，匀速移动、随机速度移动等方法都是不行的，只有完全模拟人的移动轨迹才可以通过验证，而人的移动轨迹一般是先加速后减速的，这又涉及到物理学中加速度的相关问题，我们需要模拟这个过程才能成功。 有了基本的思路之后就让我们用程序来实现一下它的识别过程吧。</p>
                  <h3 id="6-初始化"><a href="#6-初始化" class="headerlink" title="6. 初始化"></a>6. 初始化</h3>
                  <p>首先这次我们选定的链接为：<a href="https://account.geetest.com/login" target="_blank" rel="noopener">https://account.geetest.com/login</a>，也就是极验的管理后台登录页面，在这里我们首先初始化一些配置，如 Selenium 对象的初始化及一些参数的配置：</p>
                  <figure class="highlight ruby">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">EMAIL = <span class="string">'test@test.com'</span></span><br><span class="line">PASSWORD = <span class="string">'123456'</span></span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">CrackGeetest</span>():</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(<span class="keyword">self</span>)</span></span><span class="symbol">:</span></span><br><span class="line">        <span class="keyword">self</span>.url = <span class="string">'https://account.geetest.com/login'</span></span><br><span class="line">        <span class="keyword">self</span>.browser = webdriver.Chrome()</span><br><span class="line">        <span class="keyword">self</span>.wait = WebDriverWait(<span class="keyword">self</span>.browser, <span class="number">20</span>)</span><br><span class="line">        <span class="keyword">self</span>.email = EMAIL</span><br><span class="line">        <span class="keyword">self</span>.password = PASSWORD</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>其中 EMAIL 和 PASSWORD 就是登录极验需要的用户名和密码，如果没有的话可以先注册一下。</p>
                  <h3 id="7-模拟点击"><a href="#7-模拟点击" class="headerlink" title="7. 模拟点击"></a>7. 模拟点击</h3>
                  <p>随后我们需要实现第一步的操作，也就是模拟点击初始的验证按钮，所以我们定义一个方法来获取这个按钮，利用显式等待的方法来实现：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_geetest_button</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取初始验证按钮</span></span><br><span class="line"><span class="string">    :return: 按钮对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, <span class="string">'geetest_radar_tip'</span>)))</span><br><span class="line">    <span class="keyword">return</span> button</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>获取之后就会获取一个 WebElement 对象，调用它的 click() 方法即可模拟点击，代码如下：</p>
                  <figure class="highlight armasm">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"># 点击验证按钮</span><br><span class="line"><span class="keyword">button </span>= <span class="keyword">self.get_geetest_button()</span></span><br><span class="line"><span class="keyword">button.click()</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>到这里我们第一步的工作就完成了。</p>
                  <h3 id="8-识别缺口"><a href="#8-识别缺口" class="headerlink" title="8. 识别缺口"></a>8. 识别缺口</h3>
                  <p>接下来我们需要识别缺口的位置，首先我们需要将前后的两张比对图片获取下来，然后比对二者的不一致的地方即为缺口。首先我们需要获取不带缺口的图片，利用 Selenium 选取图片元素，然后得到其所在位置和宽高，随后获取整个网页的截图，再从截图中裁切出来即可，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_position</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取验证码位置</span></span><br><span class="line"><span class="string">    :return: 验证码位置元组</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, <span class="string">'geetest_canvas_img'</span>)))</span><br><span class="line">    time.sleep(<span class="number">2</span>)</span><br><span class="line">    location = img.location</span><br><span class="line">    size = img.size</span><br><span class="line">    top, bottom, left, right = location[<span class="string">'y'</span>], location[<span class="string">'y'</span>] + size[<span class="string">'height'</span>], location[<span class="string">'x'</span>], location[<span class="string">'x'</span>] + size[</span><br><span class="line">        <span class="string">'width'</span>]</span><br><span class="line">    <span class="keyword">return</span> (top, bottom, left, right)</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_geetest_image</span><span class="params">(self, name=<span class="string">'captcha.png'</span>)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取验证码图片</span></span><br><span class="line"><span class="string">    :return: 图片对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    top, bottom, left, right = self.get_position()</span><br><span class="line">    print(<span class="string">'验证码位置'</span>, top, bottom, left, right)</span><br><span class="line">    screenshot = self.get_screenshot()</span><br><span class="line">    captcha = screenshot.crop((left, top, right, bottom))</span><br><span class="line">    <span class="keyword">return</span> captcha</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里 get_position() 函数首先获取了图片对象，然后获取了它的位置和宽高，随后返回了其左上角和右下角的坐标。而 get_geetest_image() 方法则是获取了网页截图，然后调用了 crop() 方法将图片再裁切出来，返回的是 Image 对象。 随后我们需要获取第二张图片，也就是带缺口的图片，要使得图片出现缺口，我们只需要点击一下下方的滑块即可，触发这个动作之后，图片中的缺口就会显现，实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_slider</span><span class="params">(self)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取滑块</span></span><br><span class="line"><span class="string">    :return: 滑块对象</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, <span class="string">'geetest_slider_button'</span>)))</span><br><span class="line">    <span class="keyword">return</span> slider</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>利用 get_slider() 方法获取滑块对象，接下来调用其 click() 方法即可触发点击，缺口图片即可呈现：</p>
                  <figure class="highlight crystal">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="comment"># 点按呼出缺口</span></span><br><span class="line">slider = <span class="keyword">self</span>.get_slider()</span><br><span class="line">slider.click()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>随后还是调用 get_geetest_image() 方法将第二张图片获取下来即可。 到现在我们就已经得到了两张图片对象了，分别赋值给变量 image1 和 image2，接下来对比图片获取缺口即可。要对比图片的不同之处，我们在这里遍历图片的每个坐标点，获取两张图片对应像素点的 RGB 数据，然后判断二者的 RGB 数据差异，如果差距在一定范围内，那就代表两个像素相同，继续比对下一个像素点，如果差距超过一定范围，则判断像素点不同，当前位置即为缺口位置，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">is_pixel_equal</span><span class="params">(self, image1, image2, x, y)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    判断两个像素是否相同</span></span><br><span class="line"><span class="string">    :param image1: 图片1</span></span><br><span class="line"><span class="string">    :param image2: 图片2</span></span><br><span class="line"><span class="string">    :param x: 位置x</span></span><br><span class="line"><span class="string">    :param y: 位置y</span></span><br><span class="line"><span class="string">    :return: 像素是否相同</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="comment"># 取两个图片的像素点</span></span><br><span class="line">    pixel1 = image1.load()[x, y]</span><br><span class="line">    pixel2 = image2.load()[x, y]</span><br><span class="line">    threshold = <span class="number">60</span></span><br><span class="line">    <span class="keyword">if</span> abs(pixel1[<span class="number">0</span>] - pixel2[<span class="number">0</span>]) &lt; threshold <span class="keyword">and</span> abs(pixel1[<span class="number">1</span>] - pixel2[<span class="number">1</span>]) &lt; threshold <span class="keyword">and</span> abs(</span><br><span class="line">            pixel1[<span class="number">2</span>] - pixel2[<span class="number">2</span>]) &lt; threshold:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">True</span></span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        <span class="keyword">return</span> <span class="literal">False</span></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_gap</span><span 
class="params">(self, image1, image2)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    获取缺口偏移量</span></span><br><span class="line"><span class="string">    :param image1: 不带缺口图片</span></span><br><span class="line"><span class="string">    :param image2: 带缺口图片</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    left = <span class="number">60</span></span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> range(left, image1.size[<span class="number">0</span>]):</span><br><span class="line">        <span class="keyword">for</span> j <span class="keyword">in</span> range(image1.size[<span class="number">1</span>]):</span><br><span class="line">            <span class="keyword">if</span> <span class="keyword">not</span> self.is_pixel_equal(image1, image2, i, j):</span><br><span class="line">                left = i</span><br><span class="line">                <span class="keyword">return</span> left</span><br><span class="line">    <span class="keyword">return</span> left</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>get_gap() 方法即为获取缺口位置的方法，此方法的参数为两张图片，一张为带缺口图片，另一张为不带缺口图片，在这里遍历两张图片的每个像素，然后利用 is_pixel_equal() 方法判断两张图片同一位置的像素是否相同，比对的时候比较了两张图对应像素 RGB 差值的绝对值是否均小于定义的阈值 threshold，如果均在阈值之内，则像素点相同，继续遍历，否则遇到不相同的像素点就是缺口的位置。 在这里比如两张对比图片如下，如图 8-15 和 8-16 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-054959.png" alt=""> 图 8-15 初始状态 <img src="https://qiniu.cuiqingcai.com/2019-08-02-055003.jpg" alt=""> 图 8-16 后续状态 两张图片其实有两处明显不同的地方，一个就是待拼合的滑块，一个就是缺口，但是滑块的位置会出现在左边位置，缺口会出现在与滑块同一水平线的位置，所以缺口一般会在滑块的右侧，所以要寻找缺口的话，我们直接从滑块右侧寻找即可，所以在遍历的时候我们直接设置了遍历的起始横坐标为 60，也就是在滑块的右侧开始识别，这样识别出的结果就是缺口的位置了。 到现在为止，我们就可以获取缺口的位置了，剩下最后一步模拟拖动就可以完成验证了。</p>
                  <h3 id="9-模拟拖动"><a href="#9-模拟拖动" class="headerlink" title="9. 模拟拖动"></a>9. 模拟拖动</h3>
                  <p>模拟拖动的这个过程说复杂并不复杂，只是其中的坑比较多。现在我们已经获取到了缺口的位置，接下来只需要调用拖动的相关函数将滑块拖动到对应位置不就好了吗？然而事实很残酷，如果匀速拖动，极验必然会识别出来这是程序的操作，因为人是无法做到完全匀速拖动的，极验利用机器学习模型筛选出此类数据，归类为机器操作，验证码识别失败。 随后我又尝试了分段模拟，将拖动过程划分几段，每段设置一个平均速度，同时速度围绕该平均速度小幅度随机抖动，同样无法完成验证。 最后尝试了完全模拟加速减速的过程通过了验证，在前段滑块需要做匀加速运动，后面需要做匀减速运动，在这里利用物理学的加速度公式即可完成。 设滑块滑动的加速度用 a 来表示，当前速度用 v 表示，初速度用 v0 表示，位移用 x 表示，所需时间用 t 表示，则它们之间满足如下关系：</p>
                  <figure class="highlight excel">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line">x = <span class="symbol">v0</span> * <span class="built_in">t</span> + <span class="number">0.5</span> * a * <span class="built_in">t</span> * <span class="built_in">t</span> </span><br><span class="line">v = <span class="symbol">v0</span> + a * <span class="built_in">t</span></span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>接下来我们利用两个公式可以构造一个轨迹移动算法，计算出先加速后减速的运动轨迹，代码实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_track</span><span class="params">(self, distance)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    根据偏移量获取移动轨迹</span></span><br><span class="line"><span class="string">    :param distance: 偏移量</span></span><br><span class="line"><span class="string">    :return: 移动轨迹</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    <span class="comment"># 移动轨迹</span></span><br><span class="line">    track = []</span><br><span class="line">    <span class="comment"># 当前位移</span></span><br><span class="line">    current = <span class="number">0</span></span><br><span class="line">    <span class="comment"># 减速阈值</span></span><br><span class="line">    mid = distance * <span class="number">4</span> / <span class="number">5</span></span><br><span class="line">    <span class="comment"># 计算间隔</span></span><br><span class="line">    t = <span class="number">0.2</span></span><br><span class="line">    <span class="comment"># 初速度</span></span><br><span class="line">    v = <span class="number">0</span></span><br><span class="line"></span><br><span class="line">    <span class="keyword">while</span> current &lt; distance:</span><br><span class="line">        <span class="keyword">if</span> current &lt; mid:</span><br><span class="line">            <span class="comment"># 加速度为正2</span></span><br><span class="line">            a = <span class="number">2</span></span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            <span class="comment"># 加速度为负3</span></span><br><span class="line">            a = <span class="number">-3</span></span><br><span class="line">        <span class="comment"># 初速度v0</span></span><br><span class="line">        v0 = v</span><br><span 
class="line">        <span class="comment"># 当前速度v = v0 + at</span></span><br><span class="line">        v = v0 + a * t</span><br><span class="line">        <span class="comment"># 移动距离x = v0t + 1/2 * a * t^2</span></span><br><span class="line">        move = v0 * t + <span class="number">1</span> / <span class="number">2</span> * a * t * t</span><br><span class="line">        <span class="comment"># 当前位移</span></span><br><span class="line">        current += move</span><br><span class="line">        <span class="comment"># 加入轨迹</span></span><br><span class="line">        track.append(round(move))</span><br><span class="line">    <span class="keyword">return</span> track</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里我们定义了 get_track() 方法，传入的参数为移动的总距离，返回的是运动轨迹，用 track 表示，它是一个列表，列表的每个元素代表每次移动多少距离。 首先定义了一个变量 mid，即减速的阈值，也就是加速到什么位置就开始减速，在这里定义为 4/5，即模拟前 4/5 路程是加速过程，后 1/5 是减速过程。 随后定义了当前位移的距离变量 current，初始为 0，随后进入 while 循环，循环的条件是当前位移小于总距离。在循环里我们分段定义了加速度，其中加速过程加速度定义为2，减速过程加速度定义为 -3，随后再套用位移公式计算出某个时间段内的位移，同时将当前位移更新并记录到轨迹里即可。 这样直到运动轨迹达到总距离时即终止循环，最后得到的 track 即记录了每个时间间隔移动了多少位移，这样滑块的运动轨迹就得到了。 最后我们只需要按照该运动轨迹拖动滑块即可，方法实现如下：</p>
                  <figure class="highlight python">
                    <table>
                      <tr>
                        <td class="gutter">
                          <pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre>
                        </td>
                        <td class="code">
                          <pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">move_to_gap</span><span class="params">(self, slider, tracks)</span>:</span></span><br><span class="line">    <span class="string">"""</span></span><br><span class="line"><span class="string">    拖动滑块到缺口处</span></span><br><span class="line"><span class="string">    :param slider: 滑块</span></span><br><span class="line"><span class="string">    :param tracks: 轨迹</span></span><br><span class="line"><span class="string">    :return:</span></span><br><span class="line"><span class="string">    """</span></span><br><span class="line">    ActionChains(self.browser).click_and_hold(slider).perform()</span><br><span class="line">    <span class="keyword">for</span> x <span class="keyword">in</span> tracks:</span><br><span class="line">        ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=<span class="number">0</span>).perform()</span><br><span class="line">    time.sleep(<span class="number">0.5</span>)</span><br><span class="line">    ActionChains(self.browser).release().perform()</span><br></pre>
                        </td>
                      </tr>
                    </table>
                  </figure>
                  <p>在这里传入的参数为滑块对象和运动轨迹，首先调用 ActionChains 的 click_and_hold() 方法按住拖动底部滑块，随后遍历运动轨迹获取每小段位移距离，调用 move_by_offset() 方法移动此位移，最后移动完成之后调用 release() 方法松开鼠标即可。 这样再经过测试，验证就通过了，识别完成，效果如图 8-17 所示： <img src="https://qiniu.cuiqingcai.com/2019-08-02-055006.jpg" alt=""> 图 8-17 识别成功结果 最后，我们只需要将表单完善，模拟点击登录按钮即可完成登录，成功登录后即跳转到后台。 至此，极验验证码的识别工作即全部完成，此识别方法同样适用于其他使用极验 3.0 的网站，原理都是相同的。</p>
                  <h3 id="10-本节代码"><a href="#10-本节代码" class="headerlink" title="10. 本节代码"></a>10. 本节代码</h3>
                  <p>本节代码地址为：<a href="https://github.com/Python3WebSpider/CrackGeetest" target="_blank" rel="noopener">https://github.com/Python3WebSpider/CrackGeetest</a>。</p>
                  <h3 id="11-结语"><a href="#11-结语" class="headerlink" title="11. 结语"></a>11. 结语</h3>
                  <p>本节我们分析并实现了极验验证码的识别，其关键在于识别的思路，如怎样识别缺口位置，怎样生成运动轨迹等，学会了这些思路后以后我们再遇到类似原理的验证码同样可以完成识别过程。</p>
                  </p>
                </div>
              </div>
              <div class="post-meta">
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-user"></i>
                  </span>
                  <span class="post-meta-item-text">作者</span>
                  <span><a href="/authors/崔庆才" class="author" itemprop="url" rel="index">崔庆才</a></span>
                </span>
                <span class="post-meta-item">
                  <span class="post-meta-item-icon">
                    <i class="far fa-calendar"></i>
                  </span>
                  <span class="post-meta-item-text">发表于</span>
                  <time title="创建时间：2019-08-02 13:51:17" itemprop="dateCreated datePublished" datetime="2019-08-02T13:51:17+08:00">2019-08-02</time>
                </span>
                <span id="/7037.html" class="post-meta-item leancloud_visitors" data-flag-title="[Python3网络爬虫开发实战] 8.2-极验滑动验证码的识别" title="阅读次数">
                  <span class="post-meta-item-icon">
                    <i class="fa fa-eye"></i>
                  </span>
                  <span class="post-meta-item-text">阅读次数：</span>
                  <span class="leancloud-visitors-count"></span>
                </span>
                <span class="post-meta-item" title="本文字数">
                  <span class="post-meta-item-icon">
                    <i class="far fa-file-word"></i>
                  </span>
                  <span class="post-meta-item-text">本文字数：</span>
                  <span>7.5k</span>
                </span>
                <span class="post-meta-item" title="阅读时长">
                  <span class="post-meta-item-icon">
                    <i class="far fa-clock"></i>
                  </span>
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                  <span>7 分钟</span>
                </span>
              </div>
            </article>
            <script>
              // Point every ".random" image at a picsum.photos placeholder (200x133)
              // whose id is drawn uniformly from the integers 0..299.
              for (const randomImage of document.querySelectorAll('.random'))
              {
                const photoId = Math.floor(Math.random() * 300);
                randomImage.src = "https://picsum.photos/id/" + photoId + "/200/133";
              }

            </script>
            <nav class="pagination">
              <a class="extend prev" rel="prev" href="/page/11/"><i class="fa fa-angle-left" aria-label="上一页"></i></a><a class="page-number" href="/">1</a><span class="space">&hellip;</span><a class="page-number" href="/page/11/">11</a><span class="page-number current">12</span><a class="page-number" href="/page/13/">13</a><span class="space">&hellip;</span><a class="page-number" href="/page/31/">31</a><a class="extend next" rel="next" href="/page/13/"><i class="fa fa-angle-right" aria-label="下一页"></i></a>
            </nav>
          </div>
          <script>
            // When the 'tabs:register' event fires, click the link of the preferred
            // comment tab: the value stored under 'comments_active' in localStorage
            // (only consulted when CONFIG.comments.storage is truthy), falling back
            // to CONFIG.comments.activeClass.
            window.addEventListener('tabs:register', () =>
            {
              const commentsConfig = CONFIG.comments;
              const remembered = commentsConfig.storage ? localStorage.getItem('comments_active') : null;
              const preferred = remembered || commentsConfig.activeClass;
              // Nothing stored and no configured default: leave the tabs untouched.
              if (!preferred) return;
              const tabLink = document.querySelector(`a[href="#comment-${preferred}"]`);
              if (tabLink)
              {
                tabLink.click();
              }
            });
            // With storage enabled, remember which comment pane was selected: the
            // second entry of the pane's class list is saved under 'comments_active'.
            if (CONFIG.comments.storage)
            {
              window.addEventListener('tabs:click', ({ target }) =>
              {
                if (target.matches('.tabs-comment .tab-content .tab-pane'))
                {
                  localStorage.setItem('comments_active', target.classList[1]);
                }
              });
            }

          </script>
        </div>
        <div class="toggle sidebar-toggle">
          <span class="toggle-line toggle-line-first"></span>
          <span class="toggle-line toggle-line-middle"></span>
          <span class="toggle-line toggle-line-last"></span>
        </div>
        <aside class="sidebar">
          <div class="sidebar-inner">
            <ul class="sidebar-nav motion-element">
              <li class="sidebar-nav-toc"> 文章目录 </li>
              <li class="sidebar-nav-overview"> 站点概览 </li>
            </ul>
            <!--noindex-->
            <div class="post-toc-wrap sidebar-panel">
            </div>
            <!--/noindex-->
            <div class="site-overview-wrap sidebar-panel">
              <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
                <img class="site-author-image" itemprop="image" alt="崔庆才" src="/images/avatar.png">
                <p class="site-author-name" itemprop="name">崔庆才</p>
                <div class="site-description" itemprop="description">崔庆才的个人站点，记录生活的瞬间，分享学习的心得。</div>
              </div>
              <div class="site-state-wrap motion-element">
                <nav class="site-state">
                  <div class="site-state-item site-state-posts">
                    <a href="/archives/">
                      <span class="site-state-item-count">608</span>
                      <span class="site-state-item-name">日志</span>
                    </a>
                  </div>
                  <div class="site-state-item site-state-categories">
                    <a href="/categories/">
                      <span class="site-state-item-count">24</span>
                      <span class="site-state-item-name">分类</span></a>
                  </div>
                  <div class="site-state-item site-state-tags">
                    <a href="/tags/">
                      <span class="site-state-item-count">156</span>
                      <span class="site-state-item-name">标签</span></a>
                  </div>
                </nav>
              </div>
              <div class="links-of-author motion-element">
                <span class="links-of-author-item">
                  <a href="https://github.com/Germey" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;Germey" rel="noopener" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
                </span>
                <span class="links-of-author-item">
                  <a href="mailto:cqc@cuiqingcai.com" title="邮件 → mailto:cqc@cuiqingcai.com" rel="noopener" target="_blank"><i class="fa fa-envelope fa-fw"></i>邮件</a>
                </span>
                <span class="links-of-author-item">
                  <a href="https://weibo.com/cuiqingcai" title="微博 → https:&#x2F;&#x2F;weibo.com&#x2F;cuiqingcai" rel="noopener" target="_blank"><i class="fab fa-weibo fa-fw"></i>微博</a>
                </span>
                <span class="links-of-author-item">
                  <a href="https://www.zhihu.com/people/Germey" title="知乎 → https:&#x2F;&#x2F;www.zhihu.com&#x2F;people&#x2F;Germey" rel="noopener" target="_blank"><i class="fa fa-magic fa-fw"></i>知乎</a>
                </span>
              </div>
            </div>
            <div style=" width: 100%;" class="sidebar-panel sidebar-panel-image sidebar-panel-active">
              <a href="https://tutorial.lengyue.video/?coupon=12ef4b1a-a3db-11ea-bb37-0242ac130002_cqx_850" target="_blank" rel="noopener">
                <img src="https://qiniu.cuiqingcai.com/bco2a.png" alt="天验" style=" width: 100%;">
              </a>
            </div>
            <div style=" width: 100%;" class="sidebar-panel sidebar-panel-image sidebar-panel-active">
              <a href="http://www.ipidea.net/?utm-source=cqc&amp;utm-keyword=?cqc" target="_blank" rel="noopener">
                <img src="https://qiniu.cuiqingcai.com/0ywun.png" alt="IPIDEA" style=" width: 100%;">
              </a>
            </div>
            <div class="sidebar-panel sidebar-panel-tags sidebar-panel-active">
              <h4 class="name"> 标签云 </h4>
              <div class="content">
                <a href="/tags/2048/" style="font-size: 10px;">2048</a> <a href="/tags/API/" style="font-size: 10px;">API</a> <a href="/tags/Bootstrap/" style="font-size: 11.25px;">Bootstrap</a> <a href="/tags/CDN/" style="font-size: 10px;">CDN</a> <a href="/tags/CQC/" style="font-size: 10px;">CQC</a> <a href="/tags/CSS/" style="font-size: 10px;">CSS</a> <a href="/tags/CSS-%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 10px;">CSS 反爬虫</a> <a href="/tags/CV/" style="font-size: 10px;">CV</a> <a href="/tags/Django/" style="font-size: 10px;">Django</a> <a href="/tags/Eclipse/" style="font-size: 11.25px;">Eclipse</a> <a href="/tags/FTP/" style="font-size: 10px;">FTP</a> <a href="/tags/Git/" style="font-size: 10px;">Git</a> <a href="/tags/GitHub/" style="font-size: 13.75px;">GitHub</a> <a href="/tags/HTML5/" style="font-size: 10px;">HTML5</a> <a href="/tags/Hexo/" style="font-size: 10px;">Hexo</a> <a href="/tags/IT/" style="font-size: 10px;">IT</a> <a href="/tags/JSP/" style="font-size: 10px;">JSP</a> <a href="/tags/JavaScript/" style="font-size: 10px;">JavaScript</a> <a href="/tags/K8s/" style="font-size: 10px;">K8s</a> <a href="/tags/LOGO/" style="font-size: 10px;">LOGO</a> <a href="/tags/Linux/" style="font-size: 10px;">Linux</a> <a href="/tags/MIUI/" style="font-size: 10px;">MIUI</a> <a href="/tags/MongoDB/" style="font-size: 10px;">MongoDB</a> <a href="/tags/Mysql/" style="font-size: 10px;">Mysql</a> <a href="/tags/NBA/" style="font-size: 10px;">NBA</a> <a href="/tags/PHP/" style="font-size: 11.25px;">PHP</a> <a href="/tags/PS/" style="font-size: 10px;">PS</a> <a href="/tags/Pathlib/" style="font-size: 10px;">Pathlib</a> <a href="/tags/PhantomJS/" style="font-size: 10px;">PhantomJS</a> <a href="/tags/Python/" style="font-size: 15px;">Python</a> <a href="/tags/Python3/" style="font-size: 12.5px;">Python3</a> <a href="/tags/Pythonic/" style="font-size: 10px;">Pythonic</a> <a href="/tags/QQ/" style="font-size: 10px;">QQ</a> <a href="/tags/Redis/" style="font-size: 
10px;">Redis</a> <a href="/tags/SAE/" style="font-size: 10px;">SAE</a> <a href="/tags/SSH/" style="font-size: 10px;">SSH</a> <a href="/tags/SVG/" style="font-size: 10px;">SVG</a> <a href="/tags/Scrapy/" style="font-size: 10px;">Scrapy</a> <a href="/tags/Scrapy-redis/" style="font-size: 10px;">Scrapy-redis</a> <a href="/tags/Scrapy%E5%88%86%E5%B8%83%E5%BC%8F/" style="font-size: 10px;">Scrapy分布式</a> <a href="/tags/Selenium/" style="font-size: 10px;">Selenium</a> <a href="/tags/TKE/" style="font-size: 10px;">TKE</a> <a href="/tags/Ubuntu/" style="font-size: 11.25px;">Ubuntu</a> <a href="/tags/VS-Code/" style="font-size: 10px;">VS Code</a> <a href="/tags/Vs-Code/" style="font-size: 10px;">Vs Code</a> <a href="/tags/Vue/" style="font-size: 11.25px;">Vue</a> <a href="/tags/Webpack/" style="font-size: 10px;">Webpack</a> <a href="/tags/Windows/" style="font-size: 10px;">Windows</a> <a href="/tags/Winpcap/" style="font-size: 10px;">Winpcap</a> <a href="/tags/WordPress/" style="font-size: 13.75px;">WordPress</a> <a href="/tags/Youtube/" style="font-size: 11.25px;">Youtube</a> <a href="/tags/android/" style="font-size: 10px;">android</a> <a href="/tags/ansible/" style="font-size: 10px;">ansible</a> <a href="/tags/cocos2d-x/" style="font-size: 10px;">cocos2d-x</a> <a href="/tags/e6/" style="font-size: 10px;">e6</a> <a href="/tags/fitvids/" style="font-size: 10px;">fitvids</a> <a href="/tags/git/" style="font-size: 11.25px;">git</a> <a href="/tags/json/" style="font-size: 10px;">json</a> <a href="/tags/js%E9%80%86%E5%90%91/" style="font-size: 10px;">js逆向</a> <a href="/tags/kubernetes/" style="font-size: 10px;">kubernetes</a> <a href="/tags/log/" style="font-size: 10px;">log</a> <a href="/tags/logging/" style="font-size: 10px;">logging</a> <a href="/tags/matlab/" style="font-size: 11.25px;">matlab</a> <a href="/tags/python/" style="font-size: 20px;">python</a> <a href="/tags/pytube/" style="font-size: 11.25px;">pytube</a> <a href="/tags/pywin32/" style="font-size: 
10px;">pywin32</a> <a href="/tags/style/" style="font-size: 10px;">style</a> <a href="/tags/tomcat/" style="font-size: 10px;">tomcat</a> <a href="/tags/ubuntu/" style="font-size: 10px;">ubuntu</a> <a href="/tags/uwsgi/" style="font-size: 10px;">uwsgi</a> <a href="/tags/vsftpd/" style="font-size: 10px;">vsftpd</a> <a href="/tags/wamp/" style="font-size: 10px;">wamp</a> <a href="/tags/wineQQ/" style="font-size: 10px;">wineQQ</a> <a href="/tags/%E4%B8%83%E7%89%9B/" style="font-size: 11.25px;">七牛</a> <a href="/tags/%E4%B8%8A%E6%B5%B7/" style="font-size: 10px;">上海</a> <a href="/tags/%E4%B8%AA%E4%BA%BA%E7%BD%91%E7%AB%99/" style="font-size: 10px;">个人网站</a> <a href="/tags/%E4%B8%BB%E9%A2%98/" style="font-size: 10px;">主题</a> <a href="/tags/%E4%BA%91%E4%BA%A7%E5%93%81/" style="font-size: 10px;">云产品</a> <a href="/tags/%E4%BA%91%E5%AD%98%E5%82%A8/" style="font-size: 10px;">云存储</a> <a href="/tags/%E4%BA%AC%E4%B8%9C%E4%BA%91/" style="font-size: 10px;">京东云</a> <a href="/tags/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD/" style="font-size: 12.5px;">人工智能</a> <a href="/tags/%E4%BB%A3%E7%90%86/" style="font-size: 10px;">代理</a> <a href="/tags/%E4%BB%A3%E7%A0%81/" style="font-size: 10px;">代码</a> <a href="/tags/%E4%BB%A3%E7%A0%81%E5%88%86%E4%BA%AB%E5%9B%BE/" style="font-size: 10px;">代码分享图</a> <a href="/tags/%E4%BC%98%E5%8C%96/" style="font-size: 10px;">优化</a> <a href="/tags/%E4%BD%8D%E8%BF%90%E7%AE%97/" style="font-size: 10px;">位运算</a> <a href="/tags/%E5%85%AC%E4%BC%97%E5%8F%B7/" style="font-size: 10px;">公众号</a> <a href="/tags/%E5%88%86%E4%BA%AB/" style="font-size: 10px;">分享</a> <a href="/tags/%E5%88%86%E5%B8%83%E5%BC%8F/" style="font-size: 10px;">分布式</a> <a href="/tags/%E5%88%9B%E4%B8%9A/" style="font-size: 10px;">创业</a> <a href="/tags/%E5%89%8D%E7%AB%AF/" style="font-size: 12.5px;">前端</a> <a href="/tags/%E5%8D%9A%E5%AE%A2/" style="font-size: 10px;">博客</a> <a href="/tags/%E5%8E%9F%E7%94%9FAPP/" style="font-size: 10px;">原生APP</a> <a href="/tags/%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 
12.5px;">反爬虫</a> <a href="/tags/%E5%91%BD%E4%BB%A4/" style="font-size: 10px;">命令</a> <a href="/tags/%E5%93%8D%E5%BA%94%E5%BC%8F%E5%B8%83%E5%B1%80/" style="font-size: 10px;">响应式布局</a> <a href="/tags/%E5%9E%83%E5%9C%BE%E9%82%AE%E4%BB%B6/" style="font-size: 10px;">垃圾邮件</a> <a href="/tags/%E5%9F%9F%E5%90%8D%E7%BB%91%E5%AE%9A/" style="font-size: 10px;">域名绑定</a> <a href="/tags/%E5%A4%8D%E7%9B%98/" style="font-size: 10px;">复盘</a> <a href="/tags/%E5%A4%A7%E4%BC%97%E7%82%B9%E8%AF%84/" style="font-size: 10px;">大众点评</a> <a href="/tags/%E5%AD%97%E4%BD%93%E5%8F%8D%E7%88%AC%E8%99%AB/" style="font-size: 10px;">字体反爬虫</a> <a href="/tags/%E5%AD%97%E7%AC%A6%E9%97%AE%E9%A2%98/" style="font-size: 10px;">字符问题</a> <a href="/tags/%E5%AD%A6%E4%B9%A0%E6%96%B9%E6%B3%95/" style="font-size: 10px;">学习方法</a> <a href="/tags/%E5%AE%89%E5%8D%93/" style="font-size: 10px;">安卓</a> <a href="/tags/%E5%AE%9E%E7%94%A8/" style="font-size: 10px;">实用</a> <a href="/tags/%E5%B0%81%E9%9D%A2/" style="font-size: 10px;">封面</a> <a href="/tags/%E5%B4%94%E5%BA%86%E6%89%8D/" style="font-size: 18.75px;">崔庆才</a> <a href="/tags/%E5%B7%A5%E5%85%B7/" style="font-size: 12.5px;">工具</a> <a href="/tags/%E5%BC%80%E5%8F%91%E5%B7%A5%E5%85%B7/" style="font-size: 10px;">开发工具</a> <a href="/tags/%E5%BE%AE%E8%BD%AF/" style="font-size: 10px;">微软</a> <a href="/tags/%E6%80%9D%E8%80%83/" style="font-size: 10px;">思考</a> <a href="/tags/%E6%89%8B%E6%9C%BA%E8%AE%BF%E9%97%AE/" style="font-size: 10px;">手机访问</a> <a href="/tags/%E6%95%99%E7%A8%8B/" style="font-size: 10px;">教程</a> <a href="/tags/%E6%95%99%E8%82%B2/" style="font-size: 10px;">教育</a> <a href="/tags/%E6%96%B0%E4%B9%A6/" style="font-size: 12.5px;">新书</a> <a href="/tags/%E6%96%B9%E6%B3%95%E8%AE%BA/" style="font-size: 10px;">方法论</a> <a href="/tags/%E6%97%85%E6%B8%B8/" style="font-size: 10px;">旅游</a> <a href="/tags/%E6%97%A5%E5%BF%97/" style="font-size: 10px;">日志</a> <a href="/tags/%E6%9A%97%E6%97%B6%E9%97%B4/" style="font-size: 10px;">暗时间</a> <a href="/tags/%E6%9D%9C%E5%85%B0%E7%89%B9/" 
style="font-size: 11.25px;">杜兰特</a> <a href="/tags/%E6%A1%8C%E9%9D%A2/" style="font-size: 10px;">桌面</a> <a href="/tags/%E6%AD%8C%E5%8D%95/" style="font-size: 10px;">歌单</a> <a href="/tags/%E6%B1%9F%E5%8D%97/" style="font-size: 10px;">江南</a> <a href="/tags/%E6%B8%B8%E6%88%8F/" style="font-size: 10px;">游戏</a> <a href="/tags/%E7%84%A6%E8%99%91/" style="font-size: 10px;">焦虑</a> <a href="/tags/%E7%88%AC%E8%99%AB/" style="font-size: 16.25px;">爬虫</a> <a href="/tags/%E7%88%AC%E8%99%AB%E4%B9%A6%E7%B1%8D/" style="font-size: 11.25px;">爬虫书籍</a> <a href="/tags/%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F/" style="font-size: 10px;">环境变量</a> <a href="/tags/%E7%94%9F%E6%B4%BB%E7%AC%94%E8%AE%B0/" style="font-size: 10px;">生活笔记</a> <a href="/tags/%E7%99%BB%E5%BD%95/" style="font-size: 10px;">登录</a> <a href="/tags/%E7%9F%A5%E4%B9%8E/" style="font-size: 10px;">知乎</a> <a href="/tags/%E7%9F%AD%E4%BF%A1/" style="font-size: 10px;">短信</a> <a href="/tags/%E7%9F%AD%E4%BF%A1%E9%AA%8C%E8%AF%81%E7%A0%81/" style="font-size: 10px;">短信验证码</a> <a href="/tags/%E7%AC%94%E8%AE%B0%E8%BD%AF%E4%BB%B6/" style="font-size: 10px;">笔记软件</a> <a href="/tags/%E7%AF%AE%E7%BD%91/" style="font-size: 10px;">篮网</a> <a href="/tags/%E7%BA%B8%E5%BC%A0/" style="font-size: 10px;">纸张</a> <a href="/tags/%E7%BB%84%E4%BB%B6/" style="font-size: 10px;">组件</a> <a href="/tags/%E7%BD%91%E7%AB%99/" style="font-size: 10px;">网站</a> <a href="/tags/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/" style="font-size: 11.25px;">网络爬虫</a> <a href="/tags/%E7%BE%8E%E5%AD%A6/" style="font-size: 10px;">美学</a> <a href="/tags/%E8%82%89%E5%A4%B9%E9%A6%8D/" style="font-size: 10px;">肉夹馍</a> <a href="/tags/%E8%85%BE%E8%AE%AF%E4%BA%91/" style="font-size: 10px;">腾讯云</a> <a href="/tags/%E8%87%AA%E5%BE%8B/" style="font-size: 10px;">自律</a> <a href="/tags/%E8%A5%BF%E5%B0%91%E7%88%B7/" style="font-size: 10px;">西少爷</a> <a href="/tags/%E8%A7%86%E9%A2%91/" style="font-size: 10px;">视频</a> <a href="/tags/%E8%B0%B7%E6%AD%8C%E9%AA%8C%E8%AF%81%E7%A0%81/" style="font-size: 
10px;">谷歌验证码</a> <a href="/tags/%E8%BF%90%E8%90%A5/" style="font-size: 10px;">运营</a> <a href="/tags/%E8%BF%9C%E7%A8%8B/" style="font-size: 10px;">远程</a> <a href="/tags/%E9%80%86%E5%90%91/" style="font-size: 10px;">逆向</a> <a href="/tags/%E9%85%8D%E7%BD%AE/" style="font-size: 10px;">配置</a> <a href="/tags/%E9%87%8D%E8%A3%85/" style="font-size: 10px;">重装</a> <a href="/tags/%E9%98%BF%E6%9D%9C/" style="font-size: 10px;">阿杜</a> <a href="/tags/%E9%9D%99%E8%A7%85/" style="font-size: 17.5px;">静觅</a> <a href="/tags/%E9%A2%A0%E8%A6%86/" style="font-size: 10px;">颠覆</a> <a href="/tags/%E9%A3%9E%E4%BF%A1/" style="font-size: 10px;">飞信</a> <a href="/tags/%E9%B8%BF%E8%92%99/" style="font-size: 10px;">鸿蒙</a>
              </div>
              <script>
                // Every link inside the sidebar tag panel gets a background color picked below.
                const tagsElements = document.querySelectorAll('.sidebar-panel-tags .content a')
                // Palette the background colors are drawn from.
                const tagsColors = ['#00a67c', '#5cb85c', '#d9534f', '#567e95', '#b37333', '#f4843d', '#15a287']
                // The pick is random per link and per page load, so colors are not stable between visits.
                for (let position = 0; position < tagsElements.length; position++)
                {
                  const paletteIndex = Math.floor(Math.random() * tagsColors.length)
                  tagsElements[position].style.backgroundColor = tagsColors[paletteIndex]
                }

              </script>
            </div>
            <!-- Sidebar panel "分类" (categories): one list item per category, each holding a link
                 to /categories/<name>/ followed by a count span.
                 NOTE(review): the count is presumably the number of posts in the category; confirm against the generator.
                 The link and the count span sit on one line with no whitespace between them; keep it that way
                 unless the CSS is checked, since inline whitespace can change the rendering. -->
            <div class="sidebar-panel sidebar-panel-categories sidebar-panel-active">
              <h4 class="name"> 分类 </h4>
              <div class="content">
                <ul class="category-list">
                  <li class="category-list-item"><a class="category-list-link" href="/categories/C-C/">C/C++</a><span class="category-list-count">23</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/HTML/">HTML</a><span class="category-list-count">14</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Java/">Java</a><span class="category-list-count">5</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/JavaScript/">JavaScript</a><span class="category-list-count">26</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Linux/">Linux</a><span class="category-list-count">15</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Markdown/">Markdown</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Net/">Net</a><span class="category-list-count">4</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Other/">Other</a><span class="category-list-count">39</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/PHP/">PHP</a><span class="category-list-count">27</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Paper/">Paper</a><span class="category-list-count">2</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/Python/">Python</a><span class="category-list-count">261</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/TypeScript/">TypeScript</a><span class="category-list-count">2</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E5%B1%95%E7%A4%BA/">个人展示</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E6%97%A5%E8%AE%B0/">个人日记</a><span class="category-list-count">9</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E8%AE%B0%E5%BD%95/">个人记录</a><span class="category-list-count">4</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E4%B8%AA%E4%BA%BA%E9%9A%8F%E7%AC%94/">个人随笔</a><span class="category-list-count">15</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE/">安装配置</a><span class="category-list-count">59</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E6%8A%80%E6%9C%AF%E6%9D%82%E8%B0%88/">技术杂谈</a><span class="category-list-count">88</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E6%9C%AA%E5%88%86%E7%B1%BB/">未分类</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E7%94%9F%E6%B4%BB%E7%AC%94%E8%AE%B0/">生活笔记</a><span class="category-list-count">1</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E7%A6%8F%E5%88%A9%E4%B8%93%E5%8C%BA/">福利专区</a><span class="category-list-count">6</span></li>
                  <li class="category-list-item"><a class="category-list-link" href="/categories/%E8%81%8C%E4%BD%8D%E6%8E%A8%E8%8D%90/">职位推荐</a><span class="category-list-count">2</span></li>
                </ul>
              </div>
            </div>
            <div class="sidebar-panel sidebar-panel-friends sidebar-panel-active">
              <h4 class="name"> 友情链接 </h4>
              <!-- Friend links: each item is a logo image followed by a text link that opens in a new tab.
                   The logos are decorative (the site name is the adjacent link text), so they carry an
                   empty alt to keep screen readers from announcing the image URL.
                   NOTE(review): the logo for 千寻啊千寻 is loaded over plain http, which is mixed content
                   on an https page; confirm whether the host serves it over https before changing the URL. -->
              <ul class="friends">
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/j2dub.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.findhao.net/" target="_blank" rel="noopener">FindHao</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ou6mm.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://diygod.me/" target="_blank" rel="noopener">DIYgod</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/6apxu.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.51dev.com/" target="_blank" rel="noopener">IT技术社区</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.jankl.com/img/titleshu.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.jankl.com/" target="_blank" rel="noopener">liberalist</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/bqlbs.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.urselect.com/" target="_blank" rel="noopener">优社电商</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/8s88c.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.yuanrenxue.com/" target="_blank" rel="noopener">猿人学</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/2wgg5.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.yunlifang.cn/" target="_blank" rel="noopener">云立方</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/shwr6.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://lanbing510.info/" target="_blank" rel="noopener">冰蓝</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/blvoh.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://lengyue.me/" target="_blank" rel="noopener">冷月</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="http://qianxunclub.com/favicon.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://qianxunclub.com/" target="_blank" rel="noopener">千寻啊千寻</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/0044u.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="http://kodcloud.com/" target="_blank" rel="noopener">可道云</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ygnpn.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.kunkundashen.cn/" target="_blank" rel="noopener">坤坤大神</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/22uv1.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.cenchong.com/" target="_blank" rel="noopener">岑冲博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ev9kl.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.zxiaoji.com/" target="_blank" rel="noopener">张小鸡</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.503error.com/favicon.ico" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.503error.com/" target="_blank" rel="noopener">张志明个人博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/x714o.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.hubwiz.com/" target="_blank" rel="noopener">汇智网</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/129d8.png" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.bysocket.com/" target="_blank" rel="noopener">泥瓦匠BYSocket</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://www.xiongge.club/favicon.ico" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.xiongge.club/" target="_blank" rel="noopener">熊哥club</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/3w4fe.png" alt="">
                  </span>
                  <span class="link">
                    <a href="https://zerlong.com/" target="_blank" rel="noopener">知语</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/44hxf.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://redstonewill.com/" target="_blank" rel="noopener">红色石头</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/8g1fk.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="http://www.laodong.me/" target="_blank" rel="noopener">老董博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/wkaus.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://zhaoshuai.me/" target="_blank" rel="noopener">碎念</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/pgo0r.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.chenwenguan.com/" target="_blank" rel="noopener">陈文管的博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/kk82a.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.lxlinux.net/" target="_blank" rel="noopener">良许Linux教程网</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/lj0t2.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://tanqingbo.cn/" target="_blank" rel="noopener">IT码农</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/i8cdr.png" alt="">
                  </span>
                  <span class="link">
                    <a href="https://junyiseo.com/" target="_blank" rel="noopener">均益个人博客</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/chwv2.png" alt="">
                  </span>
                  <span class="link">
                    <a href="https://brucedone.com/" target="_blank" rel="noopener">大鱼的鱼塘</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/2y43o.png" alt="">
                  </span>
                  <span class="link">
                    <a href="http://bbs.nightteam.cn/" target="_blank" rel="noopener">夜幕爬虫安全论坛</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/zvc3w.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://www.weishidong.com/" target="_blank" rel="noopener">韦世东的技术专栏</a>
                  </span>
                </li>
                <li class="friend">
                  <span class="logo">
                    <img src="https://qiniu.cuiqingcai.com/ebudy.jpg" alt="">
                  </span>
                  <span class="link">
                    <a href="https://chuanjiabing.com/" target="_blank" rel="noopener">穿甲兵技术社区</a>
                  </span>
                </li>
              </ul>
            </div>
          </div>
        </aside>
        <div id="sidebar-dimmer"></div>
      </div>
    </main>
    <footer class="footer">
      <div class="footer-inner">
        <!-- Footer text: copyright line with two site statistics, the "powered by" credit, and the ICP filing link.
             The Font Awesome <i> elements are purely decorative, so they are hidden from assistive technology.
             NOTE(review): the two statistics are labelled only through title tooltips, which are not
             keyboard-accessible; consider a visible or visually-hidden label. -->
        <div class="copyright"> &copy; <span itemprop="copyrightYear">2021</span>
          <span class="with-love">
            <i class="fa fa-heart" aria-hidden="true"></i>
          </span>
          <span class="author" itemprop="copyrightHolder">崔庆才丨静觅</span>
          <span class="post-meta-divider">|</span>
          <span class="post-meta-item-icon">
            <i class="fa fa-chart-area" aria-hidden="true"></i>
          </span>
          <span title="站点总字数">2.6m</span>
          <span class="post-meta-divider">|</span>
          <span class="post-meta-item-icon">
            <i class="fa fa-coffee" aria-hidden="true"></i>
          </span>
          <span title="站点阅读时长">39:54</span>
        </div>
        <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> &amp; <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动 </div>
        <div class="beian"><a href="https://beian.miit.gov.cn/" rel="noopener" target="_blank">京ICP备18015597号-1 </a>
        </div>
        <script>
          /*
           * LeanCloud visitor counter.
           *
           * On a post page (CONFIG.page.isPost) it reads the post's counter record, shows the
           * value plus one and asks the server to increment it, or creates the record with
           * time = 1 when none exists. On pages that contain `.post-title-link` elements it
           * only reads and displays the stored counts for every `.leancloud_visitors` element.
           *
           * Missing DOM nodes are skipped instead of throwing, so one absent or mismatched
           * counter element no longer aborts the remaining updates.
           */
          (function ()
          {
            /**
             * Find the element that displays the visit count for one entry.
             *
             * @param {string} url Decoded id of the entry's container element.
             * @returns {Element|null} The `.leancloud-visitors-count` node inside the element
             *   whose id equals `encodeURI(url)`, or null when the container is not in the page.
             */
            function leancloudSelector(url)
            {
              var container = document.getElementById(encodeURI(url));
              return container ? container.querySelector('.leancloud-visitors-count') : null;
            }

            /**
             * Write a count into the page; does nothing when the display node is absent.
             *
             * @param {string} url Decoded id of the entry's container element.
             * @param {number} value Count to display.
             */
            function renderCount(url, value)
            {
              var node = leancloudSelector(url);
              if (node)
              {
                node.innerText = value;
              }
            }

            /**
             * Record one visit for the current post and display the updated count.
             *
             * @param {Function} Counter Request helper `(method, path, data)` returning a fetch promise.
             */
            function addCount(Counter)
            {
              var visitors = document.querySelector('.leancloud_visitors');
              // A post page without a visitor container has nothing to count or display.
              if (!visitors) return;
              var url = decodeURI(visitors.id);
              var title = visitors.dataset.flagTitle;
              Counter('get', '/classes/Counter?where=' + encodeURIComponent(JSON.stringify(
              {
                url
              }))).then(response => response.json()).then((
              {
                results
              }) =>
              {
                if (results.length > 0)
                {
                  var counter = results[0];
                  // Display the incremented value immediately; the server-side increment is sent afterwards.
                  renderCount(url, counter.time + 1);
                  Counter('put', '/classes/Counter/' + counter.objectId,
                  {
                    time:
                    {
                      '__op': 'Increment',
                      'amount': 1
                    }
                  }).catch(error =>
                  {
                    console.error('Failed to save visitor count', error);
                  });
                }
                else
                {
                  // No record yet for this url: create it with an initial count of 1.
                  Counter('post', '/classes/Counter',
                  {
                    title,
                    url,
                    time: 1
                  }).then(response => response.json()).then(() =>
                  {
                    renderCount(url, 1);
                  }).catch(error =>
                  {
                    console.error('Failed to create', error);
                  });
                }
              }).catch(error =>
              {
                console.error('LeanCloud Counter Error', error);
              });
            }

            /**
             * Display the stored counts for every `.leancloud_visitors` element, without incrementing.
             * Entries with no stored record are shown as 0.
             *
             * @param {Function} Counter Request helper `(method, path, data)` returning a fetch promise.
             */
            function showTime(Counter)
            {
              var visitors = document.querySelectorAll('.leancloud_visitors');
              var entries = [...visitors].map(element =>
              {
                return decodeURI(element.id);
              });
              // Nothing to display: skip the network request entirely.
              if (entries.length === 0) return;
              Counter('get', '/classes/Counter?where=' + encodeURIComponent(JSON.stringify(
              {
                url:
                {
                  '$in': entries
                }
              }))).then(response => response.json()).then((
              {
                results
              }) =>
              {
                for (let url of entries)
                {
                  let target = results.find(item => item.url === url);
                  renderCount(url, target ? target.time : 0);
                }
              }).catch(error =>
              {
                console.error('LeanCloud Counter Error', error);
              });
            }
            // Only app_id, app_key and server_url are read from the generated settings object.
            let
            {
              app_id,
              app_key,
              server_url
            } = {
              "enable": true,
              "app_id": "6X5dRQ0pnPWJgYy8SXOg0uID-gzGzoHsz",
              "app_key": "ziLDVEy73ne5HtFTiGstzHMS",
              "server_url": "https://6x5drq0p.lc-cn-n1-shared.com",
              "security": false
            };

            /**
             * Build the request helper for the given API host and run the mode that fits this page.
             *
             * @param {string} api_server Origin of the API host, including the scheme.
             */
            function fetchData(api_server)
            {
              var Counter = (method, url, data) =>
              {
                return fetch(`${api_server}/1.1${url}`,
                {
                  method,
                  headers:
                  {
                    'X-LC-Id': app_id,
                    'X-LC-Key': app_key,
                    'Content-Type': 'application/json',
                  },
                  // JSON.stringify(undefined) is undefined, so requests made without data carry no body.
                  body: JSON.stringify(data)
                });
              };
              if (CONFIG.page.isPost)
              {
                // Do not count visits when the page is served from a host other than the configured one.
                if (CONFIG.hostname !== location.hostname) return;
                addCount(Counter);
              }
              else if (document.querySelectorAll('.post-title-link').length >= 1)
              {
                showTime(Counter);
              }
            }
            // App ids ending in '-MdYXbMMI' use a host derived from the first 8 characters of the id;
            // every other app id uses the configured server_url.
            let api_server = app_id.slice(-9) !== '-MdYXbMMI' ? server_url : `https://${app_id.slice(0, 8).toLowerCase()}.api.lncldglobal.com`;
            if (api_server)
            {
              fetchData(api_server);
            }
            else
            {
              // No host configured: ask the app router for one.
              fetch('https://app-router.leancloud.cn/2/route?appId=' + app_id).then(response => response.json()).then((
              {
                api_server
              }) =>
              {
                fetchData('https://' + api_server);
              }).catch(error =>
              {
                // Without this handler a failed lookup surfaced as an unhandled promise rejection.
                console.error('LeanCloud Counter Error', error);
              });
            }
          })();

        </script>
      </div>
      <div class="footer-stat">
        <span id="cnzz_stat_icon_1279355174"></span>
        <script>
          // Loads the CNZZ statistics script synchronously at this position in the document.
          // The previous version also wrote a second span with the same id as the static span
          // above, which produced a duplicate id; lookups by id resolve to the first element in
          // document order, i.e. the static span, so the written copy was redundant.
          // The URL is now written literally instead of being decoded with the deprecated unescape().
          // NOTE(review): document.write is kept on purpose; the remote script may itself rely on
          // being parser-inserted. Confirm before switching to an async loader.
          document.write("<script src='https://v1.cnzz.com/z_stat.php?id=1279355174&online=1&show=line'><\/script>");

        </script>
      </div>
    </footer>
  </div>
  <!-- Third-party libraries and theme scripts, loaded in order.
       CDN URLs use an explicit https scheme rather than a protocol-relative one.
       NOTE(review): the three "/.js" sources below point at a file literally named ".js" at the site
       root; this looks like a build step that lost the real filename. Confirm the intended paths
       (or remove the tags) against the theme's script list. -->
  <script src="https://cdn.jsdelivr.net/npm/animejs@3.2.1/lib/anime.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/pangu@4/dist/browser/pangu.min.js"></script>
  <script src="/js/utils.js"></script>
  <script src="/.js"></script>
  <script src="/js/schemes/pisces.js"></script>
  <script src="/.js"></script>
  <script src="/js/next-boot.js"></script>
  <script src="/.js"></script>
  <script>
    /*
     * Baidu link push: reports this page's URL (and the referrer, when present) to Baidu by
     * requesting a tracking GIF. The canonical URL is preferred over location.href, and nothing
     * is sent when the URL itself is on a baidu.com subdomain.
     */
    (function ()
    {
      var canonicalURL, curProtocol;
      // Collect every <link> tag and keep the last one declared as canonical.
      var links = document.getElementsByTagName("link");
      // The counter is declared locally; it was previously an implicit global.
      for (var idx = 0; idx < links.length; idx++)
      {
        if (links[idx].rel.toLowerCase() == 'canonical' && links[idx].href)
        {
          canonicalURL = links[idx].href;
        }
      }
      // The protocol comes from the canonical URL when there is one, otherwise from the current page.
      if (!canonicalURL)
      {
        curProtocol = window.location.protocol.split(':')[0];
        // Fall back to the current URL when no canonical URL exists.
        canonicalURL = window.location.href;
      }
      else
      {
        curProtocol = canonicalURL.split(':')[0];
      }
      // "https?" replaces the former "[http|https]", which was a character class matching a single
      // character rather than the scheme. The "g" flag is dropped because it makes test() stateful.
      var baiduHost = /(https?:\/\/[a-zA-Z0-9\_\.]+\.baidu\.com)/i;
      if (baiduHost.test(canonicalURL))
      {
        return;
      }
      var pushURL = (String(curProtocol).toLowerCase() === 'https') ? "https://sp0.baidu.com/9_Q4simg2RQJ8t7jm9iCKT-xh_/s.gif" : "//api.share.baidu.com/s.gif";
      if (document.referrer)
      {
        pushURL += "?r=" + encodeURIComponent(document.referrer);
        // The page URL is appended unencoded, exactly as before.
        // NOTE(review): confirm the endpoint expects a raw URL here before adding encoding.
        pushURL += "&l=" + canonicalURL;
      }
      else
      {
        pushURL += "?l=" + canonicalURL;
      }
      // Assigning src starts the request; the image is never attached to the document.
      var beacon = new Image;
      beacon.src = pushURL;
    })();

  </script>
  <!-- Loads /js/local-search.js, then a script at "/.js".
       NOTE(review): "/.js" has no filename before the extension; it looks like a lost path. Confirm. -->
  <script src="/js/local-search.js"></script>
  <script src="/.js"></script>
</body>

</html>
